🐦 v0.14.0 #2617

Merged: 15 commits, May 16, 2023.

Changes from all commits:
6 changes: 3 additions & 3 deletions .github/workflows/inference_tests.yml

```diff
@@ -50,6 +50,6 @@ jobs:
           python3 -m pip install .[all]
           python3 setup.py egg_info
       - name: Unit tests
-        run: |
-          export COQUI_STUDIO_TOKEN=${{ secrets.COQUI_STUDIO_TOKEN }}
-          make inference_tests
+        run: make inference_tests
+        env:
+          COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
```
1 change: 1 addition & 0 deletions Dockerfile

```diff
@@ -7,6 +7,7 @@ RUN pip3 install llvmlite --ignore-installed
 WORKDIR /root
 COPY . /root
 RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+RUN rm -rf /root/.cache/pip
 RUN make install
 ENTRYPOINT ["tts"]
 CMD ["--help"]
```
5 changes: 3 additions & 2 deletions README.md

````diff
@@ -1,7 +1,7 @@
 
 ## 🐸Coqui.ai News
-- 📣 Coqui Studio API is landed on 🐸TTS. You can use the studio voices in combination with 🐸TTS models. [Example](https://github.com/coqui-ai/TTS/edit/dev/README.md#-python-api)
+- 📣 Coqui Studio API is landed on 🐸TTS. You can use the studio voices in combination with 🐸TTS models. [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
 - 📣 Voice generation with prompts - **Prompt to Voice** - is live on Coqui.ai!! [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
 - 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
 <br>
@@ -103,6 +103,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)
 - YourTTS: [paper](https://arxiv.org/abs/2112.02418)
+- Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
 
 ### Attention Methods
 - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@@ -312,7 +313,7 @@ tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy",
 
 #### Multi-speaker Models
 
-- List the available speakers and choose as <speaker_id> among them:
+- List the available speakers and choose a <speaker_id> among them:
 
 ```
 $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
````
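As a side note on the snippet above, a rough Python-API equivalent of `--list_speaker_idxs` looks like the sketch below; the VCTK model name is a placeholder for any multi-speaker model, and `tts.speakers` is the attribute the 🐸TTS README uses for this.

```python
# Sketch, not part of this diff: list available <speaker_id> values via the
# Python API instead of the CLI. The model name is a placeholder.
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/vctk/vits")
print(tts.speakers)  # speaker IDs usable as the `speaker` argument
```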
30 changes: 30 additions & 0 deletions TTS/.models.json

```diff
@@ -220,6 +220,36 @@
                 "license": "apache 2.0",
                 "contact": "[email protected]"
             }
+        },
+        "multi-dataset": {
+            "tortoise-v2": {
+                "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
+                "github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
+                ],
+                "commit": "c1875f6",
+                "default_vocoder": null,
+                "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
+                "license": "apache 2.0"
+            }
+        },
+        "jenny": {
+            "jenny": {
+                "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
+                "default_vocoder": null,
+                "commit": "ba40a1c",
+                "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
+                "author": "@noml4u"
+            }
+        }
     },
     "es": {
```
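The `tortoise-v2` entry is the first whose `github_rls_url` is a list: the manager downloads every listed file into a single model directory, and the model loads itself from there (see the `TTS/api.py` changes below). A minimal sketch of fetching it by name, assuming the high-level `TTS` class and the usual `<type>/<lang>/<dataset>/<model>` naming scheme:

```python
# Minimal sketch: fetching the new multi-file Tortoise release by name.
# Every URL in the "github_rls_url" list lands in one model directory.
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/multi-dataset/tortoise-v2")
```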
2 changes: 1 addition & 1 deletion TTS/VERSION

```diff
@@ -1 +1 @@
-0.13.3
+0.14.0
```
30 changes: 21 additions & 9 deletions TTS/api.py

```diff
@@ -342,10 +342,14 @@ def list_models():
 
     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
+        if isinstance(model_item["github_rls_url"], list):
+            # return model directory if there are multiple files
+            # we assume that the model knows how to load itself
+            return None, None, None, None, model_path
         if model_item.get("default_vocoder") is None:
-            return model_path, config_path, None, None
+            return model_path, config_path, None, None, None
         vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
-        return model_path, config_path, vocoder_path, vocoder_config_path
+        return model_path, config_path, vocoder_path, vocoder_config_path, None
 
     def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
         """Load one of the voice conversion models by name.
```
```diff
@@ -355,7 +359,7 @@ def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         self.model_name = model_name
-        model_path, config_path, _, _ = self.download_model_by_name(model_name)
+        model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
         self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
 
     def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
@@ -374,7 +378,9 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
         if "coqui_studio" in model_name:
             self.csapi = CS_API()
         else:
-            model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
+            model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
+                model_name
+            )
 
             # init synthesizer
             # None values are fetch from the model
@@ -387,6 +393,7 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
                 vocoder_config=vocoder_config_path,
                 encoder_checkpoint=None,
                 encoder_config=None,
+                model_dir=model_dir,
                 use_cuda=gpu,
             )
 
@@ -422,6 +429,7 @@ def _check_arguments(
         speaker_wav: str = None,
         emotion: str = None,
         speed: float = None,
+        **kwargs,
     ) -> None:
         """Check if the arguments are valid for the model."""
         if not self.is_coqui_studio:
@@ -430,7 +438,7 @@ def _check_arguments(
                 raise ValueError("Model is multi-speaker but no `speaker` is provided.")
             if self.is_multi_lingual and language is None:
                 raise ValueError("Model is multi-lingual but no `language` is provided.")
-            if not self.is_multi_speaker and speaker is not None:
+            if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
                 raise ValueError("Model is not multi-speaker but `speaker` is provided.")
             if not self.is_multi_lingual and language is not None:
                 raise ValueError("Model is not multi-lingual but `language` is provided.")
@@ -499,6 +507,7 @@ def tts(
         speaker_wav: str = None,
         emotion: str = None,
         speed: float = None,
+        **kwargs,
     ):
         """Convert text to speech.
 
@@ -520,12 +529,13 @@ def tts(
                 Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
                 Defaults to None.
         """
-        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)
+        self._check_arguments(
+            speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
+        )
         if self.csapi is not None:
             return self.tts_coqui_studio(
                 text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
             )
-
         wav = self.synthesizer.tts(
             text=text,
             speaker_name=speaker,
@@ -535,6 +545,7 @@ def tts(
             style_wav=None,
             style_text=None,
             reference_speaker_name=None,
+            **kwargs,
         )
         return wav
 
@@ -547,6 +558,7 @@ def tts_to_file(
         emotion: str = "Neutral",
         speed: float = 1.0,
         file_path: str = "output.wav",
+        **kwargs,
     ):
         """Convert text to speech.
 
@@ -569,13 +581,13 @@ def tts_to_file(
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
         """
-        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)
+        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
 
         if self.csapi is not None:
             return self.tts_coqui_studio(
                 text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
             )
-        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
+        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
         self.synthesizer.save_wav(wav=wav, path=file_path)
         return file_path
```
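Taken together, the `**kwargs` plumbing lets model-specific arguments such as `voice_dir` flow from `tts_to_file()` through `_check_arguments()` down to `synthesizer.tts()`, and the presence of `voice_dir` relaxes the multi-speaker check. A usage sketch under assumptions (the `voice_dir` layout and the voice name are made up for illustration):

```python
# Usage sketch: the voice_dir and speaker values are hypothetical examples.
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/multi-dataset/tortoise-v2")
tts.tts_to_file(
    text="Hello from the new Tortoise integration.",
    file_path="output.wav",
    voice_dir="voice_samples/",  # assumed: a directory of reference clips per voice
    speaker="my_voice",          # assumed: a voice name inside voice_dir
)
```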
17 changes: 9 additions & 8 deletions TTS/bin/remove_silence_using_vad.py

```diff
@@ -16,7 +16,7 @@ def adjust_path_and_remove_silence(audio_path):
     output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
     # ignore if the file exists
     if os.path.exists(output_path) and not args.force:
-        return output_path
+        return output_path, False
 
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
@@ -28,7 +28,6 @@ def adjust_path_and_remove_silence(audio_path):
         trim_just_beginning_and_end=args.trim_just_beginning_and_end,
         use_cuda=args.use_cuda,
     )
-
     return output_path, is_speech
 
 
@@ -70,7 +69,7 @@ def preprocess_audios():
         # write files that do not have speech
         with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
             for file in filtered_files:
-                f.write(file + "\n")
+                f.write(str(file) + "\n")
     else:
         print("> No files Found !")
 
@@ -79,10 +78,8 @@ def preprocess_audios():
     parser = argparse.ArgumentParser(
         description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
     )
-    parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
-    parser.add_argument(
-        "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
-    )
+    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
+    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
     parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
     parser.add_argument(
         "-g",
@@ -118,6 +115,10 @@ def preprocess_audios():
         help="Number of processes to use",
     )
     args = parser.parse_args()
+
+    if args.output_dir == "":
+        args.output_dir = args.input_dir
+
     # load the model and utils
-    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
+    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
     preprocess_audios()
```
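With the new defaults, omitting `-o/--output_dir` makes the script write back into the input tree: since `output_path` is built by substituting `input_dir` with `output_dir`, the two paths coincide and `--force` decides whether existing files are replaced. A small sketch of that path arithmetic (directory and file names are examples):

```python
# Sketch of the new fallback: output_dir defaults to input_dir, so the
# trimmed file path equals the original path (in-place processing).
import os

input_dir = "VCTK-Corpus/"
output_dir = input_dir  # what `if args.output_dir == "":` now does
audio_path = os.path.join(input_dir, "p225", "p225_001.flac")
output_path = audio_path.replace(os.path.join(input_dir, ""), os.path.join(output_dir, ""))
assert output_path == audio_path
```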
20 changes: 19 additions & 1 deletion TTS/bin/synthesize.py

```diff
@@ -274,6 +274,13 @@ def main():
         help="Target audio file to convert in the voice of the source_wav",
     )
 
+    parser.add_argument(
+        "--voice_dir",
+        type=str,
+        default=None,
+        help="Voice dir for tortoise model",
+    )
+
     args = parser.parse_args()
 
     # print the description if either text or list_models is not set
@@ -306,6 +313,7 @@ def main():
     encoder_config_path = None
     vc_path = None
     vc_config_path = None
+    model_dir = None
 
     # CASE1 #list : list pre-trained TTS models
     if args.list_models:
@@ -335,7 +343,6 @@ def main():
     # CASE4: load pre-trained model paths
     if args.model_name is not None and not args.model_path:
         model_path, config_path, model_item = manager.download_model(args.model_name)
-
         # tts model
         if model_item["model_type"] == "tts_models":
             tts_path = model_path
@@ -348,6 +355,13 @@ def main():
             vc_path = model_path
             vc_config_path = config_path
 
+        # tts model with multiple files to be loaded from the directory path
+        if isinstance(model_item["github_rls_url"], list):
+            model_dir = model_path
+            tts_path = None
+            tts_config_path = None
+            args.vocoder_name = None
+
     # load vocoder
     if args.vocoder_name is not None and not args.vocoder_path:
         vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
@@ -379,6 +393,8 @@ def main():
         encoder_config_path,
         vc_path,
         vc_config_path,
+        model_dir,
+        args.voice_dir,
         args.use_cuda,
     )
 
@@ -427,6 +443,8 @@ def main():
             source_wav=args.source_wav,
             target_wav=args.target_wav,
         )
+    elif model_dir is not None:
+        wav = synthesizer.tts(args.text, speaker_name=args.speaker_idx)
 
     # save the results
     print(" > Saving output to {}".format(args.out_path))
```
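End to end, the updated CLI path can be exercised roughly as below; this is a sketch, where the model name comes from the `.models.json` entry above and the `--voice_dir`/`--speaker_idx` values are invented, pairing with the voice layout assumed earlier.

```python
# Sketch: drive the updated `tts` CLI from Python; flag values are examples.
import subprocess

subprocess.run(
    [
        "tts",
        "--model_name", "tts_models/en/multi-dataset/tortoise-v2",
        "--text", "Hello from Tortoise in TTS v0.14.0.",
        "--voice_dir", "voice_samples/",  # hypothetical directory of voices
        "--speaker_idx", "my_voice",      # hypothetical voice name
        "--out_path", "output.wav",
    ],
    check=True,
)
```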