Embed audio samples in generated tutorials
It turns out that generated tutorials can embed playable audio if the following conditions are met.
This commit changes how audio samples are shown in the tutorials so that they become playable in the generated documentation; a minimal cell pattern is sketched below.

1. There is only one `IPython.display.Audio` call in a cell
2. `IPython.display.Audio` is the last function called in the cell
3. Audio format is `wav`
   (`flac` can be embedded, but browsers (Chrome/Safari) won't play it)

Ref: https://stackoverflow.com/a/33109647
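
For illustration, here is a minimal sketch of a tutorial cell that satisfies the three conditions above. The file name `output.wav`, the silent placeholder waveform, and the 16 kHz sample rate are assumptions for this sketch only, not assets or values from this repository.

```python
import torch
import torchaudio
import IPython.display

# Placeholder waveform: one second of silence at 16 kHz (assumption for this sketch).
waveform = torch.zeros(1, 16000)

# Save as `wav`; `flac` would be embedded but Chrome/Safari will not play it.
torchaudio.save("output.wav", waveform, sample_rate=16000)

# The one and only `IPython.display.Audio` call, placed as the last expression
# in the cell, so its output becomes the cell output embedded in the generated doc.
IPython.display.Audio("output.wav")
```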
mthrok committed Nov 5, 2021
1 parent b3c2cfc commit 7315e9e
Showing 3 changed files with 67 additions and 14 deletions.
6 changes: 3 additions & 3 deletions examples/gallery/tts/tacotron2_pipeline_tutorial.py
@@ -270,7 +270,7 @@ def text_to_sequence(text):
ax2.plot(waveforms[0].cpu().detach())

torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
-IPython.display.display(IPython.display.Audio("output_wavernn.wav"))
+IPython.display.Audio("output_wavernn.wav")


######################################################################
@@ -299,7 +299,7 @@ def text_to_sequence(text):
ax2.plot(waveforms[0].cpu().detach())

torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
-IPython.display.display(IPython.display.Audio("output_griffinlim.wav"))
+IPython.display.Audio("output_griffinlim.wav")


######################################################################
@@ -330,4 +330,4 @@ def text_to_sequence(text):
ax2.plot(waveforms[0].cpu().detach())

torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050)
-IPython.display.display(IPython.display.Audio("output_waveglow.wav"))
+IPython.display.Audio("output_waveglow.wav")
71 changes: 62 additions & 9 deletions examples/gallery/wav2vec2/forced_alignment_tutorial.py
@@ -56,8 +56,8 @@
print(torchaudio.__version__)
print(device)

-SPEECH_URL = 'https://download.pytorch.org/torchaudio/test-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac'
-SPEECH_FILE = 'speech.flac'
+SPEECH_URL = 'https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav'
+SPEECH_FILE = 'speech.wav'

if not os.path.exists(SPEECH_FILE):
    with open(SPEECH_FILE, 'wb') as file:
@@ -422,18 +422,71 @@ def plot_alignments(trellis, segments, word_segments, waveform):
plot_alignments(trellis, segments, word_segments, waveform[0],)
plt.show()

-# Generate the audio for each segment
-print(transcript)
-IPython.display.display(IPython.display.Audio(SPEECH_FILE))
-ratio = waveform.size(1) / (trellis.size(0) - 1)
-for i, word in enumerate(word_segments):
+# A trick to embed the resulting audio to the generated file.
+# `IPython.display.Audio` has to be the last call in a cell,
+# and there should be only one call.
+def _show(i):
+    ratio = waveform.size(1) / (trellis.size(0) - 1)
+    word = word_segments[i]
    x0 = int(ratio * word.start)
    x1 = int(ratio * word.end)
    filename = f"{i}_{word.label}.wav"
    torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate)
-    print(f"{word.label}: {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f}")
-    IPython.display.display(IPython.display.Audio(filename))
+    print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec")
+    return filename

+######################################################################
+#
+
+# Generate the audio for each segment
+print(transcript)
+IPython.display.Audio(SPEECH_FILE)
+
+
+######################################################################
+#
+
+IPython.display.Audio(_show(0))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(1))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(2))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(3))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(4))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(5))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(6))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(7))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(8))

######################################################################
# Conclusion
4 changes: 2 additions & 2 deletions
@@ -120,7 +120,7 @@
# Creative Commons BY 4.0.
#

-IPython.display.display(IPython.display.Audio(SPEECH_FILE))
+IPython.display.Audio(SPEECH_FILE)


######################################################################
@@ -273,7 +273,7 @@ def forward(self, emission: torch.Tensor) -> str:
#

print(transcript)
-IPython.display.display(IPython.display.Audio(SPEECH_FILE))
+IPython.display.Audio(SPEECH_FILE)


######################################################################