Embed audio samples in generated tutorials
It turns out that generated tutorials can embed playable audio if the following conditions are met.
This commit changes how audio samples are shown in the tutorials so that they become playable in the generated documentation; a minimal cell pattern is sketched below.

1. There is only one `IPython.display.Audio` call in a cell
2. `IPython.display.Audio` is the last function called in the cell
3. Audio format is `wav`
   (`flac` can be embedded, but browsers (Chrome/Safari) won't play it)

Ref: https://stackoverflow.com/a/33109647
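
For illustration, here is a minimal sketch of a tutorial cell that satisfies the three conditions above. The file name `output.wav`, the silent placeholder waveform, and the 16 kHz sample rate are assumptions for this sketch only, not assets or values from this repository.

```python
import torch
import torchaudio
import IPython.display

# Placeholder waveform: one second of silence at 16 kHz (assumption for this sketch).
waveform = torch.zeros(1, 16000)

# Save as `wav`; `flac` would be embedded but Chrome/Safari will not play it.
torchaudio.save("output.wav", waveform, sample_rate=16000)

# The one and only `IPython.display.Audio` call, placed as the last expression
# in the cell, so its output becomes the cell output embedded in the generated doc.
IPython.display.Audio("output.wav")
```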
mthrok committed Nov 5, 2021
1 parent b3c2cfc commit 7315e9e
Showing 3 changed files with 67 additions and 14 deletions.
6 changes: 3 additions & 3 deletions examples/gallery/tts/tacotron2_pipeline_tutorial.py
@@ -270,7 +270,7 @@ def text_to_sequence(text):
ax2.plot(waveforms[0].cpu().detach())

torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
-IPython.display.display(IPython.display.Audio("output_wavernn.wav"))
+IPython.display.Audio("output_wavernn.wav")


######################################################################
@@ -299,7 +299,7 @@ def text_to_sequence(text):
ax2.plot(waveforms[0].cpu().detach())

torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
-IPython.display.display(IPython.display.Audio("output_griffinlim.wav"))
+IPython.display.Audio("output_griffinlim.wav")


######################################################################
@@ -330,4 +330,4 @@ def text_to_sequence(text):
ax2.plot(waveforms[0].cpu().detach())

torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050)
-IPython.display.display(IPython.display.Audio("output_waveglow.wav"))
+IPython.display.Audio("output_waveglow.wav")
71 changes: 62 additions & 9 deletions examples/gallery/wav2vec2/forced_alignment_tutorial.py
@@ -56,8 +56,8 @@
print(torchaudio.__version__)
print(device)

-SPEECH_URL = 'https://download.pytorch.org/torchaudio/test-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac'
-SPEECH_FILE = 'speech.flac'
+SPEECH_URL = 'https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav'
+SPEECH_FILE = 'speech.wav'

if not os.path.exists(SPEECH_FILE):
    with open(SPEECH_FILE, 'wb') as file:
@@ -422,18 +422,71 @@ def plot_alignments(trellis, segments, word_segments, waveform):
plot_alignments(trellis, segments, word_segments, waveform[0],)
plt.show()

-# Generate the audio for each segment
-print(transcript)
-IPython.display.display(IPython.display.Audio(SPEECH_FILE))
-ratio = waveform.size(1) / (trellis.size(0) - 1)
-for i, word in enumerate(word_segments):
+# A trick to embed the resulting audio to the generated file.
+# `IPython.display.Audio` has to be the last call in a cell,
+# and there should be only one call.
+def _show(i):
+    ratio = waveform.size(1) / (trellis.size(0) - 1)
+    word = word_segments[i]
    x0 = int(ratio * word.start)
    x1 = int(ratio * word.end)
    filename = f"{i}_{word.label}.wav"
    torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate)
-    print(f"{word.label}: {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f}")
-    IPython.display.display(IPython.display.Audio(filename))
+    print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec")
+    return filename

+######################################################################
+#
+
+# Generate the audio for each segment
+print(transcript)
+IPython.display.Audio(SPEECH_FILE)
+
+
+######################################################################
+#
+
+IPython.display.Audio(_show(0))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(1))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(2))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(3))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(4))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(5))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(6))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(7))
+
+######################################################################
+#
+
+IPython.display.Audio(_show(8))

######################################################################
# Conclusion
4 changes: 2 additions & 2 deletions
@@ -120,7 +120,7 @@
# Creative Commons BY 4.0.
#

-IPython.display.display(IPython.display.Audio(SPEECH_FILE))
+IPython.display.Audio(SPEECH_FILE)


######################################################################
@@ -273,7 +273,7 @@ def forward(self, emission: torch.Tensor) -> str:
#

print(transcript)
-IPython.display.display(IPython.display.Audio(SPEECH_FILE))
+IPython.display.Audio(SPEECH_FILE)


######################################################################