import re

import numpy as np
from IPython.display import Audio
from TTS.utils.synthesizer import Synthesizer

from src.tts_output import TTSOutput


class TextToSpeech:
    SAMPLE_RATE = 22050
    synthesizer = None

    @staticmethod
    def initialize_synthesizer(tts_path, tts_config_path, speakers_file_path):
        TextToSpeech.synthesizer = Synthesizer(
            tts_checkpoint=tts_path,
            tts_config_path=tts_config_path,
            tts_speakers_file=speakers_file_path,
            use_cuda=False,
        )
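
    # The checkpoint configured at the bottom of this file is Coqui TTS's
    # multi-speaker English VCTK VITS model (tts_models--en--vctk--vits);
    # speaker names such as "p225" are VCTK speaker IDs taken from the
    # speakers file passed here.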

    @staticmethod
    def text_to_audio(text, plt=None, speaker_name="p225", speed=1.0):
        total_split_words = 0
        total_word_indices = 0
        # Split after sentence-final punctuation followed by a space.
        sentences = re.split(r"(?<=[.!?]) ", text)
        sentences = [sentence.strip() for sentence in sentences]
        combined_output = None
        for sentence in sentences:
            output = TextToSpeech.synthesizer.tts(
                text=sentence, speaker_name=speaker_name, return_extra_outputs=True
            )
            if speed != 1.0:
                # Re-synthesize with the predicted phoneme durations scaled by
                # 1/speed, so speed > 1.0 shortens every phoneme uniformly.
                new_durations = output[1]["outputs"]["durations"].clone() / speed
                output = TextToSpeech.synthesizer.tts(
                    text=sentence,
                    speaker_name=speaker_name,
                    return_extra_outputs=True,
                    durations=new_durations[0],
                )
            sentence_output = TTSOutput.from_output(
                output, sentence, TextToSpeech.synthesizer
            )
            print(" ---- ")
            print(f"sentence: {sentence}")
            # Keep the counts as integers so the sanity checks below compare
            # numbers rather than strings.
            word_count = len(
                TextToSpeech.custom_split(sentence, len(sentence_output.word_indices))
            )
            timestamp_count = len(sentence_output.word_indices)
            total_split_words += word_count
            total_word_indices += timestamp_count
            print(f"split: {word_count} word indices: {timestamp_count}")
            print(
                f"sum split: {total_split_words} "
                f"sum word indices: {total_word_indices}"
            )
            if word_count != timestamp_count:
                print("@@@@@@@@ failed ")
                print(f"split: {word_count} word indices: {timestamp_count}")
                if plt:
                    plt.figure()
                    TextToSpeech.plot_spectrogram_with_words(
                        plt, TextToSpeech.add_beeps(sentence_output)
                    )
                print(" ---- ")
                print(f"sentence: {sentence}")
                print(
                    "custom split: "
                    + str(TextToSpeech.custom_split(sentence, timestamp_count))
                )
                print(f"word indices: {sentence_output.word_indices}")
                if word_count > timestamp_count:
                    raise IndexError("failed")
                print(
                    "failed, but not raising an error because there are fewer "
                    "words than timestamps; it is probably a verse number, "
                    "which will always be the last word to show anyway."
                )
            if combined_output is None:
                combined_output = sentence_output
            else:
                combined_output = combined_output.combine_with(sentence_output)
        return combined_output

    @staticmethod
    def custom_split(text, expected_word_count=-1):
        # First try the naive split on spaces and hyphens.
        words = re.split("[ -]", text)
        if len(words) == expected_word_count:
            return words
        # Word pairs whose boundary the synthesizer tends to fuse, so they
        # receive a single word timestamp between them.
        common_fused_words = [
            "to be",
            "Do not",
            "do not",
            "does not",
            "that it",
            "that the",
            "as is",
            "for the",
            "For the",
            "has been",
            "of a",
            "with the",
            "of the",
            "did not",
            "from the",
            "on the",
            "I am",
            "for an",
            "have been",
            "I shall",
            "in the",
            "of events",
            # "but I",
            # "I will",
            "no one",
        ]
        common_fused_words_regex = [
            r"\b" + re.escape(word) + r"\b" for word in common_fused_words
        ]
        # Temporarily join each fused pair with an underscore so it survives
        # the split on spaces, then restore the space afterwards.
        for i, fused in enumerate(common_fused_words):
            regex_pattern = common_fused_words_regex[i]
            replacement_pattern = fused.replace(" ", "_")
            text = re.sub(regex_pattern, replacement_pattern, text)
        words = text.split(" ")
        words = [word.replace("_", " ") for word in words]
        return words
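
    # For example, custom_split("I am going to be there", 4) first splits on
    # spaces and hyphens into six tokens, then fuses "I am" and "to be" and
    # returns ["I am", "going", "to be", "there"], matching the expected count.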

    @staticmethod
    def plot_spectrogram_with_words(plt, audio_output):
        spec = TextToSpeech.synthesizer.tts_model.ap.melspectrogram(audio_output.audio)
        plt.figure(figsize=(20, 5))
        plt.imshow(spec, origin="lower", aspect="auto", interpolation="none")
        # Differences between consecutive timestamps give the phoneme durations;
        # their cumulative sum gives the x-tick positions (one per phoneme end).
        phoneme_durations = np.diff(audio_output.phoneme_timestamps, prepend=0)
        cumulative_durations = np.cumsum(phoneme_durations)
        plt.xticks(cumulative_durations, audio_output.pre_tokenized_text, rotation=0)
        # Mark each word boundary with a vertical red line.
        for x in audio_output.word_timestamps:
            plt.axvline(x, color="red", linewidth=4)
        plt.gca().xaxis.tick_top()
        plt.title("Word durations")
        plt.show()

    @staticmethod
    def add_beeps(audio_output):
        # 0.1 s, 1 kHz sine beep used to mark each word onset audibly.
        beep = np.sin(
            2 * np.pi * 1000 * np.arange(0, 0.1, 1 / TextToSpeech.SAMPLE_RATE)
        )
        audio_samples = np.array(audio_output.audio)
        # 86.0 approximates the spectrogram frame rate in frames per second
        # (SAMPLE_RATE / hop_length; 22050 / 256 ≈ 86.1), converting frame
        # timestamps into sample indices.
        word_sample_indices = [
            int(audio_output.phoneme_timestamps[idx] / 86.0 * TextToSpeech.SAMPLE_RATE)
            for idx in audio_output.word_indices
        ]
        for start_sample in word_sample_indices:
            end_sample = min(start_sample + len(beep), len(audio_samples))
            # Truncate a slice of the beep if it runs past the end of the
            # audio, leaving the full beep intact for any remaining words.
            audio_samples[start_sample:end_sample] += beep[: end_sample - start_sample]
        return TTSOutput(
            audio=audio_samples,
            pre_tokenized_text=audio_output.pre_tokenized_text,
            phoneme_timestamps=audio_output.phoneme_timestamps,
            total_running_time_s=audio_output.total_running_time_s,
            word_timestamps=audio_output.word_timestamps,
            word_indices=audio_output.word_indices,
        )


# Paths and parameters
# TODO: load these paths from the environment or a config file instead of
# hard-coding them.
tts_path = "/Users/ben/Library/Application Support/tts/tts_models--en--vctk--vits/model_file.pth"
tts_config_path = "/Users/ben/Library/Application Support/tts/tts_models--en--vctk--vits/config.json"
speakers_file_path = "/Users/ben/Library/Application Support/tts/tts_models--en--vctk--vits/speaker_ids.json"
text = "hello world; this is an example sentence"

# Initialization and execution
TextToSpeech.initialize_synthesizer(tts_path, tts_config_path, speakers_file_path)
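
# A minimal usage sketch: synthesize the sample text and play it inline. The
# Audio import suggests this file is driven from a notebook; this assumes the
# combined TTSOutput exposes the `audio` field used throughout this file.
output = TextToSpeech.text_to_audio(text)
Audio(output.audio, rate=TextToSpeech.SAMPLE_RATE)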