diff --git a/sherpa-onnx/csrc/offline-tts.cc b/sherpa-onnx/csrc/offline-tts.cc
index ec2c69523..d858b71b1 100644
--- a/sherpa-onnx/csrc/offline-tts.cc
+++ b/sherpa-onnx/csrc/offline-tts.cc
@@ -96,7 +96,32 @@ OfflineTts::~OfflineTts() = default;
 GeneratedAudio OfflineTts::Generate(
     const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/,
     GeneratedAudioCallback callback /*= nullptr*/) const {
+#if !defined(_WIN32)
   return impl_->Generate(text, sid, speed, std::move(callback));
+#else
+  // On Windows the text is not assumed to be UTF-8: valid UTF-8 is passed
+  // through unchanged, text that looks like GB2312 is converted to UTF-8
+  // first, and anything else is forwarded as-is after logging a warning.
+  if (IsUtf8(text)) {
+    return impl_->Generate(text, sid, speed, std::move(callback));
+  } else if (IsGB2312(text)) {
+    auto utf8_text = Gb2312ToUtf8(text);
+    // Log the conversion notice only once per process. The flag is a plain
+    // bool, i.e., it is not synchronized across threads.
+    static bool printed = false;
+    if (!printed) {
+      SHERPA_ONNX_LOGE(
+          "Detected GB2312 encoded string! Converting it to UTF8.");
+      printed = true;
+    }
+    return impl_->Generate(utf8_text, sid, speed, std::move(callback));
+  } else {
+    SHERPA_ONNX_LOGE(
+        "Non UTF8 encoded string is received. You would not get expected "
+        "results!");
+    return impl_->Generate(text, sid, speed, std::move(callback));
+  }
+#endif
 }
 
 int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); }
diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc
index 7259ed7c4..d0a64a8ce 100644
--- a/sherpa-onnx/csrc/text-utils.cc
+++ b/sherpa-onnx/csrc/text-utils.cc
@@ -18,4 +18,8 @@
 
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
 #include "sherpa-onnx/csrc/macros.h"
 
 // This file is copied/modified from
@@ -502,4 +506,149 @@ std::string RemoveInvalidUtf8Sequences(const std::string &text,
   return ans;
 }
 
+// Return true if text consists solely of well-formed UTF-8 byte sequences
+// (an empty string is considered valid). The accepted byte ranges reject
+// overlong encodings, UTF-16 surrogates (0xed 0xa0..0xbf) and code points
+// above U+10FFFF.
+bool IsUtf8(const std::string &text) {
+  int32_t n = static_cast<int32_t>(text.size());
+  int32_t i = 0;
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(text.data());
+  while (i < n) {
+    // 1 byte: ASCII
+    if (p[i] <= 0x7f) {
+      i += 1;
+      continue;
+    }
+
+    // 2 bytes: U+0080..U+07FF
+    if (InRange(p[i], 0xc2, 0xdf) && i + 1 < n &&
+        InRange(p[i + 1], 0x80, 0xbf)) {
+      i += 2;
+      continue;
+    }
+
+    // 3 bytes: U+0800..U+FFFF, excluding surrogates
+    if (p[i] == 0xe0 && i + 2 < n && InRange(p[i + 1], 0xa0, 0xbf) &&
+        InRange(p[i + 2], 0x80, 0xbf)) {
+      i += 3;
+      continue;
+    }
+
+    if (InRange(p[i], 0xe1, 0xec) && i + 2 < n &&
+        InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf)) {
+      i += 3;
+      continue;
+    }
+
+    if (p[i] == 0xed && i + 2 < n && InRange(p[i + 1], 0x80, 0x9f) &&
+        InRange(p[i + 2], 0x80, 0xbf)) {
+      i += 3;
+      continue;
+    }
+
+    if (InRange(p[i], 0xee, 0xef) && i + 2 < n &&
+        InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf)) {
+      i += 3;
+      continue;
+    }
+
+    // 4 bytes: U+10000..U+10FFFF
+    if (p[i] == 0xf0 && i + 3 < n && InRange(p[i + 1], 0x90, 0xbf) &&
+        InRange(p[i + 2], 0x80, 0xbf) && InRange(p[i + 3], 0x80, 0xbf)) {
+      i += 4;
+      continue;
+    }
+
+    if (InRange(p[i], 0xf1, 0xf3) && i + 3 < n &&
+        InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf) &&
+        InRange(p[i + 3], 0x80, 0xbf)) {
+      i += 4;
+      continue;
+    }
+
+    if (p[i] == 0xf4 && i + 3 < n && InRange(p[i + 1], 0x80, 0x8f) &&
+        InRange(p[i + 2], 0x80, 0xbf) && InRange(p[i + 3], 0x80, 0xbf)) {
+      i += 4;
+      continue;
+    }
+
+    // Invalid lead byte, invalid continuation byte, or truncated sequence.
+    return false;
+  }
+
+  return true;
+}
+
+// Return true if every byte of text is either ASCII (<= 0x7f) or part of a
+// two-byte pair whose first byte is in [0xa1, 0xf7] and whose second byte is
+// in [0xa1, 0xfe]. This checks the byte pattern only, so it is a heuristic:
+// text in other encodings can match the same pattern.
+bool IsGB2312(const std::string &text) {
+  int32_t n = static_cast<int32_t>(text.size());
+  int32_t i = 0;
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(text.data());
+  while (i < n) {
+    if (p[i] <= 0x7f) {
+      i += 1;
+      continue;
+    }
+
+    if (InRange(p[i], 0xa1, 0xf7) && i + 1 < n &&
+        InRange(p[i + 1], 0xa1, 0xfe)) {
+      i += 2;
+      continue;
+    }
+
+    return false;
+  }
+
+  return true;
+}
+
+#if defined(_WIN32)
+// Convert a GB2312 (Windows code page 936) encoded string to UTF-8.
+// Return an empty string if text is empty or if the conversion fails.
+std::string Gb2312ToUtf8(const std::string &text) {
+  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar
+  // 936 is from
+  // https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
+  // GB2312 -> 936
+  const int32_t num_bytes = static_cast<int32_t>(text.size());
+
+  // The first call only queries the number of wide characters required.
+  int32_t num_wchars =
+      MultiByteToWideChar(936, 0, text.data(), num_bytes, nullptr, 0);
+  if (num_wchars <= 0) {
+    return {};
+  }
+
+  std::wstring wstr(num_wchars, L'\0');
+  num_wchars = MultiByteToWideChar(936, 0, text.data(), num_bytes, wstr.data(),
+                                   num_wchars);
+  if (num_wchars <= 0) {
+    return {};
+  }
+
+  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
+  // Pass the explicit length instead of -1. With -1 the terminating null
+  // character is converted as well and would be embedded at the end of the
+  // returned std::string.
+  int32_t num_chars = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), num_wchars,
+                                          nullptr, 0, nullptr, nullptr);
+  if (num_chars <= 0) {
+    return {};
+  }
+
+  std::string ans(num_chars, '\0');
+  num_chars = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), num_wchars,
+                                  ans.data(), num_chars, nullptr, nullptr);
+  if (num_chars <= 0) {
+    return {};
+  }
+
+  return ans;
+}
+#endif
+
 }  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/text-utils.h b/sherpa-onnx/csrc/text-utils.h
index a27137060..f0ecdb8e6 100644
--- a/sherpa-onnx/csrc/text-utils.h
+++ b/sherpa-onnx/csrc/text-utils.h
@@ -127,6 +127,21 @@ void ToLowerCase(std::string *in_out);
 std::string RemoveInvalidUtf8Sequences(const std::string &text,
                                        bool show_debug_msg = false);
 
+// Return true if text consists solely of well-formed UTF-8 byte sequences.
+// An empty string is considered valid. Return false otherwise.
+bool IsUtf8(const std::string &text);
+
+// Return true if text matches the GB2312 byte pattern, i.e., it contains
+// only ASCII bytes and two-byte sequences in the GB2312 byte ranges.
+// Return false otherwise.
+bool IsGB2312(const std::string &text);
+
+#if defined(_WIN32)
+// Convert a GB2312 (code page 936) encoded string to UTF-8.
+// Return an empty string if text is empty or if the conversion fails.
+std::string Gb2312ToUtf8(const std::string &text);
+#endif
+
 }  // namespace sherpa_onnx
 
 #endif  // SHERPA_ONNX_CSRC_TEXT_UTILS_H_