🐦 v0.14.0 #2617

Merged: 15 commits, May 16, 2023.

Changes from all commits:
6 changes: 3 additions & 3 deletions .github/workflows/inference_tests.yml

```diff
@@ -50,6 +50,6 @@ jobs:
           python3 -m pip install .[all]
           python3 setup.py egg_info
       - name: Unit tests
-        run: |
-          export COQUI_STUDIO_TOKEN=${{ secrets.COQUI_STUDIO_TOKEN }}
-          make inference_tests
+        run: make inference_tests
+        env:
+          COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
```
1 change: 1 addition & 0 deletions Dockerfile

```diff
@@ -7,6 +7,7 @@ RUN pip3 install llvmlite --ignore-installed
 WORKDIR /root
 COPY . /root
 RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+RUN rm -rf /root/.cache/pip
 RUN make install
 ENTRYPOINT ["tts"]
 CMD ["--help"]
```
5 changes: 3 additions & 2 deletions README.md

````diff
@@ -1,7 +1,7 @@
 
 ## 🐸Coqui.ai News
-- 📣 Coqui Studio API is landed on 🐸TTS. You can use the studio voices in combination with 🐸TTS models. [Example](https://github.com/coqui-ai/TTS/edit/dev/README.md#-python-api)
+- 📣 Coqui Studio API is landed on 🐸TTS. You can use the studio voices in combination with 🐸TTS models. [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
 - 📣 Voice generation with prompts - **Prompt to Voice** - is live on Coqui.ai!! [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
 - 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
 <br>
@@ -103,6 +103,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)
 - YourTTS: [paper](https://arxiv.org/abs/2112.02418)
+- Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
 
 ### Attention Methods
 - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@@ -312,7 +313,7 @@ tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy",
 
 #### Multi-speaker Models
 
-- List the available speakers and choose as <speaker_id> among them:
+- List the available speakers and choose a <speaker_id> among them:
 
 ```
 $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
````
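As a side note on the snippet above, a rough Python-API equivalent of `--list_speaker_idxs` looks like the sketch below; the VCTK model name is a placeholder for any multi-speaker model, and `tts.speakers` is the attribute the 🐸TTS README uses for this.

```python
# Sketch, not part of this diff: list available <speaker_id> values via the
# Python API instead of the CLI. The model name is a placeholder.
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/vctk/vits")
print(tts.speakers)  # speaker IDs usable as the `speaker` argument
```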
30 changes: 30 additions & 0 deletions TTS/.models.json

```diff
@@ -220,6 +220,36 @@
                 "license": "apache 2.0",
                 "contact": "[email protected]"
             }
+        },
+        "multi-dataset": {
+            "tortoise-v2": {
+                "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
+                "github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
+                ],
+                "commit": "c1875f6",
+                "default_vocoder": null,
+                "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
+                "license": "apache 2.0"
+            }
+        },
+        "jenny": {
+            "jenny": {
+                "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
+                "default_vocoder": null,
+                "commit": "ba40a1c",
+                "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
+                "author": "@noml4u"
+            }
+        }
     },
     "es": {
```
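The `tortoise-v2` entry is the first whose `github_rls_url` is a list: the manager downloads every listed file into a single model directory, and the model loads itself from there (see the `TTS/api.py` changes below). A minimal sketch of fetching it by name, assuming the high-level `TTS` class and the usual `<type>/<lang>/<dataset>/<model>` naming scheme:

```python
# Minimal sketch: fetching the new multi-file Tortoise release by name.
# Every URL in the "github_rls_url" list lands in one model directory.
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/multi-dataset/tortoise-v2")
```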
2 changes: 1 addition & 1 deletion TTS/VERSION

```diff
@@ -1 +1 @@
-0.13.3
+0.14.0
```
30 changes: 21 additions & 9 deletions TTS/api.py

```diff
@@ -342,10 +342,14 @@ def list_models():
 
     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
+        if isinstance(model_item["github_rls_url"], list):
+            # return model directory if there are multiple files
+            # we assume that the model knows how to load itself
+            return None, None, None, None, model_path
         if model_item.get("default_vocoder") is None:
-            return model_path, config_path, None, None
+            return model_path, config_path, None, None, None
         vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
-        return model_path, config_path, vocoder_path, vocoder_config_path
+        return model_path, config_path, vocoder_path, vocoder_config_path, None
 
     def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
         """Load one of the voice conversion models by name.
```
```diff
@@ -355,7 +359,7 @@ def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         self.model_name = model_name
-        model_path, config_path, _, _ = self.download_model_by_name(model_name)
+        model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
         self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
 
     def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
@@ -374,7 +378,9 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
         if "coqui_studio" in model_name:
             self.csapi = CS_API()
         else:
-            model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
+            model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
+                model_name
+            )
 
             # init synthesizer
             # None values are fetch from the model
@@ -387,6 +393,7 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
                 vocoder_config=vocoder_config_path,
                 encoder_checkpoint=None,
                 encoder_config=None,
+                model_dir=model_dir,
                 use_cuda=gpu,
             )
 
@@ -422,6 +429,7 @@ def _check_arguments(
         speaker_wav: str = None,
         emotion: str = None,
         speed: float = None,
+        **kwargs,
     ) -> None:
         """Check if the arguments are valid for the model."""
         if not self.is_coqui_studio:
@@ -430,7 +438,7 @@ def _check_arguments(
                 raise ValueError("Model is multi-speaker but no `speaker` is provided.")
             if self.is_multi_lingual and language is None:
                 raise ValueError("Model is multi-lingual but no `language` is provided.")
-            if not self.is_multi_speaker and speaker is not None:
+            if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
                 raise ValueError("Model is not multi-speaker but `speaker` is provided.")
             if not self.is_multi_lingual and language is not None:
                 raise ValueError("Model is not multi-lingual but `language` is provided.")
@@ -499,6 +507,7 @@ def tts(
         speaker_wav: str = None,
         emotion: str = None,
         speed: float = None,
+        **kwargs,
     ):
         """Convert text to speech.
 
@@ -520,12 +529,13 @@ def tts(
                 Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
                 Defaults to None.
         """
-        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)
+        self._check_arguments(
+            speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
+        )
         if self.csapi is not None:
             return self.tts_coqui_studio(
                 text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
             )
-
         wav = self.synthesizer.tts(
             text=text,
             speaker_name=speaker,
@@ -535,6 +545,7 @@ def tts(
             style_wav=None,
             style_text=None,
             reference_speaker_name=None,
+            **kwargs,
         )
         return wav
 
@@ -547,6 +558,7 @@ def tts_to_file(
         emotion: str = "Neutral",
         speed: float = 1.0,
         file_path: str = "output.wav",
+        **kwargs,
     ):
         """Convert text to speech.
 
@@ -569,13 +581,13 @@ def tts_to_file(
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
         """
-        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)
+        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
 
         if self.csapi is not None:
             return self.tts_coqui_studio(
                 text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
             )
-        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
+        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
         self.synthesizer.save_wav(wav=wav, path=file_path)
         return file_path
```
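Taken together, the `**kwargs` plumbing lets model-specific arguments such as `voice_dir` flow from `tts_to_file()` through `_check_arguments()` down to `synthesizer.tts()`, and the presence of `voice_dir` relaxes the multi-speaker check. A usage sketch under assumptions (the `voice_dir` layout and the voice name are made up for illustration):

```python
# Usage sketch: the voice_dir and speaker values are hypothetical examples.
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/multi-dataset/tortoise-v2")
tts.tts_to_file(
    text="Hello from the new Tortoise integration.",
    file_path="output.wav",
    voice_dir="voice_samples/",  # assumed: a directory of reference clips per voice
    speaker="my_voice",          # assumed: a voice name inside voice_dir
)
```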
17 changes: 9 additions & 8 deletions TTS/bin/remove_silence_using_vad.py

```diff
@@ -16,7 +16,7 @@ def adjust_path_and_remove_silence(audio_path):
     output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
     # ignore if the file exists
     if os.path.exists(output_path) and not args.force:
-        return output_path
+        return output_path, False
 
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
@@ -28,7 +28,6 @@ def adjust_path_and_remove_silence(audio_path):
         trim_just_beginning_and_end=args.trim_just_beginning_and_end,
         use_cuda=args.use_cuda,
     )
-
     return output_path, is_speech
 
 
@@ -70,7 +69,7 @@ def preprocess_audios():
         # write files that do not have speech
         with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
             for file in filtered_files:
-                f.write(file + "\n")
+                f.write(str(file) + "\n")
     else:
         print("> No files Found !")
 
@@ -79,10 +78,8 @@ def preprocess_audios():
     parser = argparse.ArgumentParser(
         description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
     )
-    parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
-    parser.add_argument(
-        "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
-    )
+    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
+    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
     parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
     parser.add_argument(
         "-g",
@@ -118,6 +115,10 @@ def preprocess_audios():
         help="Number of processes to use",
     )
     args = parser.parse_args()
+
+    if args.output_dir == "":
+        args.output_dir = args.input_dir
+
     # load the model and utils
-    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
+    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
     preprocess_audios()
```
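With the new defaults, omitting `-o/--output_dir` makes the script write back into the input tree: since `output_path` is built by substituting `input_dir` with `output_dir`, the two paths coincide and `--force` decides whether existing files are replaced. A small sketch of that path arithmetic (directory and file names are examples):

```python
# Sketch of the new fallback: output_dir defaults to input_dir, so the
# trimmed file path equals the original path (in-place processing).
import os

input_dir = "VCTK-Corpus/"
output_dir = input_dir  # what `if args.output_dir == "":` now does
audio_path = os.path.join(input_dir, "p225", "p225_001.flac")
output_path = audio_path.replace(os.path.join(input_dir, ""), os.path.join(output_dir, ""))
assert output_path == audio_path
```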
20 changes: 19 additions & 1 deletion TTS/bin/synthesize.py

```diff
@@ -274,6 +274,13 @@ def main():
         help="Target audio file to convert in the voice of the source_wav",
     )
 
+    parser.add_argument(
+        "--voice_dir",
+        type=str,
+        default=None,
+        help="Voice dir for tortoise model",
+    )
+
     args = parser.parse_args()
 
     # print the description if either text or list_models is not set
@@ -306,6 +313,7 @@ def main():
     encoder_config_path = None
     vc_path = None
     vc_config_path = None
+    model_dir = None
 
     # CASE1 #list : list pre-trained TTS models
     if args.list_models:
@@ -335,7 +343,6 @@ def main():
     # CASE4: load pre-trained model paths
     if args.model_name is not None and not args.model_path:
         model_path, config_path, model_item = manager.download_model(args.model_name)
-
         # tts model
         if model_item["model_type"] == "tts_models":
             tts_path = model_path
@@ -348,6 +355,13 @@ def main():
             vc_path = model_path
             vc_config_path = config_path
 
+        # tts model with multiple files to be loaded from the directory path
+        if isinstance(model_item["github_rls_url"], list):
+            model_dir = model_path
+            tts_path = None
+            tts_config_path = None
+            args.vocoder_name = None
+
     # load vocoder
     if args.vocoder_name is not None and not args.vocoder_path:
         vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
@@ -379,6 +393,8 @@ def main():
         encoder_config_path,
         vc_path,
         vc_config_path,
+        model_dir,
+        args.voice_dir,
         args.use_cuda,
     )
 
@@ -427,6 +443,8 @@ def main():
             source_wav=args.source_wav,
             target_wav=args.target_wav,
         )
+    elif model_dir is not None:
+        wav = synthesizer.tts(args.text, speaker_name=args.speaker_idx)
 
     # save the results
     print(" > Saving output to {}".format(args.out_path))
```
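End to end, the updated CLI path can be exercised roughly as below; this is a sketch, where the model name comes from the `.models.json` entry above and the `--voice_dir`/`--speaker_idx` values are invented, pairing with the voice layout assumed earlier.

```python
# Sketch: drive the updated `tts` CLI from Python; flag values are examples.
import subprocess

subprocess.run(
    [
        "tts",
        "--model_name", "tts_models/en/multi-dataset/tortoise-v2",
        "--text", "Hello from Tortoise in TTS v0.14.0.",
        "--voice_dir", "voice_samples/",  # hypothetical directory of voices
        "--speaker_idx", "my_voice",      # hypothetical voice name
        "--out_path", "output.wav",
    ],
    check=True,
)
```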