Skip to content

Commit

Permalink
Use piper-phonemize to convert text to token IDs
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Nov 28, 2023
1 parent db41778 commit d212ac0
Show file tree
Hide file tree
Showing 13 changed files with 204 additions and 37 deletions.
13 changes: 12 additions & 1 deletion python-api-examples/offline-tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,25 @@ def get_args():
parser.add_argument(
"--vits-lexicon",
type=str,
default="",
help="Path to lexicon.txt",
)

parser.add_argument(
"--vits-tokens",
type=str,
default="",
help="Path to tokens.txt",
)

parser.add_argument(
"--vits-data-dir",
type=str,
default="",
help="""Path to the dict directory of espeak-ng. If it is specified,
--vits-lexicon and --vits-tokens are ignored""",
)

parser.add_argument(
"--tts-rule-fsts",
type=str,
Expand Down Expand Up @@ -142,13 +152,14 @@ def main():
vits=sherpa_onnx.OfflineTtsVitsModelConfig(
model=args.vits_model,
lexicon=args.vits_lexicon,
data_dir=args.vits_data_dir,
tokens=args.vits_tokens,
),
provider=args.provider,
debug=args.debug,
num_threads=args.num_threads,
),
rule_fsts=args.tts_rule_fsts
rule_fsts=args.tts_rule_fsts,
)
tts = sherpa_onnx.OfflineTts(tts_config)

Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ set(sources
packed-sequence.cc
pad-sequence.cc
parse-options.cc
piper-phonemize-lexicon.cc
provider.cc
resample.cc
session.cc
Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon,
#endif

std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
const std::string &text) const {
const std::string &text, const std::string & /*voice*/ /*= ""*/) const {
switch (language_) {
case Language::kEnglish:
return ConvertTextToTokenIdsEnglish(text);
Expand Down
5 changes: 4 additions & 1 deletion sherpa-onnx/csrc/lexicon.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ namespace sherpa_onnx {
// TODO(fangjun): Refactor it to an abstract class
class Lexicon {
public:
virtual ~Lexicon() = default;
Lexicon() = default; // for subclasses
Lexicon(const std::string &lexicon, const std::string &tokens,
const std::string &punctuations, const std::string &language,
bool debug = false, bool is_piper = false);
Expand All @@ -34,7 +36,8 @@ class Lexicon {
bool is_piper = false);
#endif

std::vector<int64_t> ConvertTextToTokenIds(const std::string &text) const;
virtual std::vector<int64_t> ConvertTextToTokenIds(
const std::string &text, const std::string &voice = "") const;

private:
std::vector<int64_t> ConvertTextToTokenIdsGerman(
Expand Down
36 changes: 26 additions & 10 deletions sherpa-onnx/csrc/offline-tts-vits-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "sherpa-onnx/csrc/offline-tts-impl.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
#include "sherpa-onnx/csrc/text-utils.h"

namespace sherpa_onnx {
Expand All @@ -29,10 +30,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
public:
explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config)
: config_(config),
model_(std::make_unique<OfflineTtsVitsModel>(config.model)),
lexicon_(config.model.vits.lexicon, config.model.vits.tokens,
model_->Punctuations(), model_->Language(), config.model.debug,
model_->IsPiper()) {
model_(std::make_unique<OfflineTtsVitsModel>(config.model)) {
InitLexicon();

if (!config.rule_fsts.empty()) {
std::vector<std::string> files;
SplitStringToVector(config.rule_fsts, ",", false, &files);
Expand All @@ -50,9 +50,10 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config)
: config_(config),
model_(std::make_unique<OfflineTtsVitsModel>(mgr, config.model)),
lexicon_(mgr, config.model.vits.lexicon, config.model.vits.tokens,
model_->Punctuations(), model_->Language(), config.model.debug,
model_->IsPiper()) {
lexicon_(std::make_unique<Lexicon>(
mgr, config.model.vits.lexicon, config.model.vits.tokens,
model_->Punctuations(), model_->Language(), config.model.debug,
model_->IsPiper())) {
if (!config.rule_fsts.empty()) {
std::vector<std::string> files;
SplitStringToVector(config.rule_fsts, ",", false, &files);
Expand Down Expand Up @@ -101,13 +102,14 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
}
}

std::vector<int64_t> x = lexicon_.ConvertTextToTokenIds(text);
std::vector<int64_t> x =
lexicon_->ConvertTextToTokenIds(text, model_->Voice());
if (x.empty()) {
SHERPA_ONNX_LOGE("Failed to convert %s to token IDs", text.c_str());
return {};
}

if (model_->AddBlank()) {
if (model_->AddBlank() && config_.model.vits.data_dir.empty()) {
std::vector<int64_t> buffer(x.size() * 2 + 1);
int32_t i = 1;
for (auto k : x) {
Expand Down Expand Up @@ -143,11 +145,25 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
return ans;
}

private:
// Creates lexicon_ for the already-loaded model_.
//
// A PiperPhonemizeLexicon is selected only when all of the following hold:
//  - the model is a piper model,
//  - the model language is "English",
//  - config_.model.vits.data_dir is not empty.
// In every other case the Lexicon built from the configured lexicon and
// tokens files is used.
void InitLexicon() {
  const auto &vits = config_.model.vits;

  const bool use_piper_phonemize = model_->IsPiper() &&
                                   model_->Language() == "English" &&
                                   !vits.data_dir.empty();

  if (use_piper_phonemize) {
    lexicon_ = std::make_unique<PiperPhonemizeLexicon>(vits.data_dir);
    return;
  }

  lexicon_ = std::make_unique<Lexicon>(
      vits.lexicon, vits.tokens, model_->Punctuations(), model_->Language(),
      config_.model.debug, model_->IsPiper());
}

private:
OfflineTtsConfig config_;
std::unique_ptr<OfflineTtsVitsModel> model_;
std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_;
Lexicon lexicon_;
std::unique_ptr<Lexicon> lexicon_;
};

} // namespace sherpa_onnx
Expand Down
60 changes: 45 additions & 15 deletions sherpa-onnx/csrc/offline-tts-vits-model-config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) {
po->Register("vits-model", &model, "Path to VITS model");
po->Register("vits-lexicon", &lexicon, "Path to lexicon.txt for VITS models");
po->Register("vits-tokens", &tokens, "Path to tokens.txt for VITS models");
po->Register("vits-data-dir", &data_dir,
"Path to the directory containing dict for espeak-ng. If it is "
"given, --vits-lexicon and --vits-tokens are ignored.");
po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models");
po->Register("vits-noise-scale-w", &noise_scale_w,
"noise_scale_w for VITS models");
Expand All @@ -31,24 +34,50 @@ bool OfflineTtsVitsModelConfig::Validate() const {
return false;
}

if (lexicon.empty()) {
SHERPA_ONNX_LOGE("Please provide --vits-lexicon");
return false;
}
if (data_dir.empty()) {
if (lexicon.empty()) {
SHERPA_ONNX_LOGE("Please provide --vits-lexicon");
return false;
}

if (!FileExists(lexicon)) {
SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str());
return false;
}
if (!FileExists(lexicon)) {
SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str());
return false;
}

if (tokens.empty()) {
SHERPA_ONNX_LOGE("Please provide --vits-tokens");
return false;
}
if (tokens.empty()) {
SHERPA_ONNX_LOGE("Please provide --vits-tokens");
return false;
}

if (!FileExists(tokens)) {
SHERPA_ONNX_LOGE("--vits-tokens: %s does not exist", tokens.c_str());
return false;
if (!FileExists(tokens)) {
SHERPA_ONNX_LOGE("--vits-tokens: %s does not exist", tokens.c_str());
return false;
}
} else {
if (!FileExists(data_dir + "/phontab")) {
SHERPA_ONNX_LOGE("%s/phontab does not exist. Skipping test",
data_dir.c_str());
return false;
}

if (!FileExists(data_dir + "/phonindex")) {
SHERPA_ONNX_LOGE("%s/phonindex does not exist. Skipping test",
data_dir.c_str());
return false;
}

if (!FileExists(data_dir + "/phondata")) {
SHERPA_ONNX_LOGE("%s/phondata does not exist. Skipping test",
data_dir.c_str());
return false;
}

if (!FileExists(data_dir + "/intonations")) {
SHERPA_ONNX_LOGE("%s/intonations does not exist. Skipping test",
data_dir.c_str());
return false;
}
}

return true;
Expand All @@ -61,6 +90,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const {
os << "model=\"" << model << "\", ";
os << "lexicon=\"" << lexicon << "\", ";
os << "tokens=\"" << tokens << "\", ";
os << "data_dir=\"" << data_dir << "\", ";
os << "noise_scale=" << noise_scale << ", ";
os << "noise_scale_w=" << noise_scale_w << ", ";
os << "length_scale=" << length_scale << ")";
Expand Down
6 changes: 6 additions & 0 deletions sherpa-onnx/csrc/offline-tts-vits-model-config.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ struct OfflineTtsVitsModelConfig {
std::string lexicon;
std::string tokens;

// If data_dir is given, lexicon and tokens are ignored
// data_dir is for piper-phonemize, which uses espeak-ng
std::string data_dir;

float noise_scale = 0.667;
float noise_scale_w = 0.8;
float length_scale = 1;
Expand All @@ -28,11 +32,13 @@ struct OfflineTtsVitsModelConfig {
// Builds a config from explicit values.
//
// data_dir is the espeak-ng data directory used by piper-phonemize. If it is
// given, lexicon and tokens are ignored.
OfflineTtsVitsModelConfig(const std::string &model,
                          const std::string &lexicon,
                          const std::string &tokens,
                          const std::string &data_dir,
                          float noise_scale = 0.667,
                          float noise_scale_w = 0.8, float length_scale = 1)
    : model{model},
      lexicon{lexicon},
      tokens{tokens},
      data_dir{data_dir},
      noise_scale{noise_scale},
      noise_scale_w{noise_scale_w},
      length_scale{length_scale} {}
Expand Down
8 changes: 8 additions & 0 deletions sherpa-onnx/csrc/offline-tts-vits-model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class OfflineTtsVitsModel::Impl {

// Read-only accessors for values cached in this Impl. Each one returns a copy
// of the corresponding member.
std::string Punctuations() const { return punctuations_; }
std::string Language() const { return language_; }
// Voice name handed to espeak-ng, e.g., en-us. It stays empty unless the
// metadata-loading code assigns it.
std::string Voice() const { return voice_; }
bool IsPiper() const { return is_piper_; }
int32_t NumSpeakers() const { return num_speakers_; }

Expand Down Expand Up @@ -78,6 +79,11 @@ class OfflineTtsVitsModel::Impl {
SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers");
SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation");
SHERPA_ONNX_READ_META_DATA_STR(language_, "language");
// SHERPA_ONNX_READ_META_DATA_STR(voice_, "voice");
if (language_ == "English") {
// FIXME(fangjun): Read voice from the metadata
voice_ = "en-us";
}

std::string comment;
SHERPA_ONNX_READ_META_DATA_STR(comment, "comment");
Expand Down Expand Up @@ -215,6 +221,7 @@ class OfflineTtsVitsModel::Impl {
int32_t num_speakers_;
std::string punctuations_;
std::string language_;
std::string voice_;

bool is_piper_ = false;
};
Expand Down Expand Up @@ -244,6 +251,7 @@ std::string OfflineTtsVitsModel::Punctuations() const {
}

// The accessors below simply forward to the pimpl object.

// Language of the model, e.g., Chinese, English, German, etc.
std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); }
// Voice name for espeak-ng, e.g., en-us.
std::string OfflineTtsVitsModel::Voice() const { return impl_->Voice(); }

// True if the implementation reports the model as a piper model.
bool OfflineTtsVitsModel::IsPiper() const { return impl_->IsPiper(); }

Expand Down
3 changes: 2 additions & 1 deletion sherpa-onnx/csrc/offline-tts-vits-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ class OfflineTtsVitsModel {
bool AddBlank() const;

std::string Punctuations() const;
std::string Language() const;
std::string Language() const; // e.g., Chinese, English, German, etc.
std::string Voice() const; // e.g., en-us, for espeak-ng
bool IsPiper() const;
int32_t NumSpeakers() const;

Expand Down
63 changes: 63 additions & 0 deletions sherpa-onnx/csrc/piper-phonemize-lexicon.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// sherpa-onnx/csrc/piper-phonemize-lexicon.cc
//
// Copyright (c) 2022-2023 Xiaomi Corporation

#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"

#include <map>
#include <mutex> // NOLINT

#include "espeak-ng/speak_lib.h"
#include "phoneme_ids.hpp"
#include "phonemize.hpp"
#include "sherpa-onnx/csrc/macros.h"

namespace sherpa_onnx {

// Initializes the espeak-ng library with the given data directory.
//
// The call is guarded by a function-local std::once_flag, so
// espeak_Initialize() is invoked by the first call only; subsequent calls do
// not re-initialize espeak-ng, even when they pass a different data_dir.
//
// If espeak_Initialize() returns anything other than 22050, an error is
// logged and the process exits with -1.
void InitEspeak(const std::string &data_dir) {
  static std::once_flag espeak_once;

  // std::call_once runs the callable synchronously, so data_dir outlives it
  const auto initialize = [&data_dir]() {
    const int32_t ret =
        espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, data_dir.c_str(), 0);
    if (ret == 22050) {
      return;
    }

    SHERPA_ONNX_LOGE(
        "Failed to initialize espeak-ng with data dir: %s. Return code is: "
        "%d",
        data_dir.c_str(), ret);
    exit(-1);
  };

  std::call_once(espeak_once, initialize);
}

// Remembers the espeak-ng data directory and asks InitEspeak() to set up
// espeak-ng with it.
PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &data_dir)
    : data_dir_(data_dir) {
  // data_dir_ was just copied from data_dir, so both hold the same path
  InitEspeak(data_dir);
}

// Converts text to phoneme IDs with piper-phonemize (backed by espeak-ng).
//
// @param text  The text to phonemize.
// @param voice The espeak-ng voice name, e.g., en-us. If it is empty, the
//              default voice of piper::eSpeakPhonemeConfig is kept instead
//              of being overwritten with an empty name.
// @return The phoneme IDs of every entry produced by
//         piper::phonemize_eSpeak(), concatenated in order.
std::vector<int64_t> PiperPhonemizeLexicon::ConvertTextToTokenIds(
    const std::string &text, const std::string &voice /*= ""*/) const {
  piper::eSpeakPhonemeConfig config;

  // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
  // to list available voices
  if (!voice.empty()) {
    config.voice = voice;  // e.g., voice is en-us
  }

  std::vector<std::vector<piper::Phoneme>> phonemes;
  piper::phonemize_eSpeak(text, config, phonemes);

  piper::PhonemeIdConfig id_config;

  // Scratch buffers reused for every entry of phonemes.
  std::vector<piper::PhonemeId> phoneme_ids;
  // Filled by piper::phonemes_to_ids(); its content is currently not reported
  std::map<piper::Phoneme, std::size_t> missing_phonemes;

  std::vector<int64_t> ans;
  for (const auto &p : phonemes) {
    phoneme_ids.clear();
    missing_phonemes.clear();

    // Qualified explicitly instead of relying on argument-dependent lookup
    piper::phonemes_to_ids(p, id_config, phoneme_ids, missing_phonemes);

    ans.insert(ans.end(), phoneme_ids.begin(), phoneme_ids.end());
  }

  return ans;
}

} // namespace sherpa_onnx
Loading

0 comments on commit d212ac0

Please sign in to comment.