Skip to content

Commit

Permalink
Fix passing gb2312 encoded strings to tts on Windows (#1819)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Feb 8, 2025
1 parent 51b4274 commit d38cb81
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 0 deletions.
20 changes: 20 additions & 0 deletions sherpa-onnx/csrc/offline-tts.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,27 @@ OfflineTts::~OfflineTts() = default;
// Forwards a synthesis request to the underlying implementation.
//
// On Windows the incoming bytes are inspected first: text that is not valid
// UTF-8 but looks like GB2312 is converted to UTF-8 before being forwarded.
// Text in any other encoding is forwarded unchanged after logging a warning.
// On every other platform the text is forwarded as is.
GeneratedAudio OfflineTts::Generate(
    const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/,
    GeneratedAudioCallback callback /*= nullptr*/) const {
#if defined(_WIN32)
  if (!IsUtf8(text)) {
    if (IsGB2312(text)) {
      // Convert before logging so the order of emitted messages is stable.
      const std::string converted = Gb2312ToUtf8(text);

      // Only report the detection the first time it happens.
      static bool already_reported = false;
      if (!already_reported) {
        SHERPA_ONNX_LOGE(
            "Detected GB2312 encoded string! Converting it to UTF8.");
        already_reported = true;
      }

      return impl_->Generate(converted, sid, speed, std::move(callback));
    }

    // Neither UTF-8 nor GB2312: warn, then fall through and forward the raw
    // bytes.
    SHERPA_ONNX_LOGE(
        "Non UTF8 encoded string is received. You would not get expected "
        "results!");
  }
#endif

  return impl_->Generate(text, sid, speed, std::move(callback));
}

int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); }
Expand Down
123 changes: 123 additions & 0 deletions sherpa-onnx/csrc/text-utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#include <utility>
#include <vector>

#if defined(_WIN32)
#include <Windows.h>
#endif

#include "sherpa-onnx/csrc/macros.h"

// This file is copied/modified from
Expand Down Expand Up @@ -502,4 +506,123 @@ std::string RemoveInvalidUtf8Sequences(const std::string &text,
return ans;
}

// Returns true if every byte of `text` belongs to a well-formed UTF-8
// sequence, and false as soon as a malformed or truncated sequence is found.
// An empty string is considered valid.
//
// The lead byte decides how long the sequence is and which values the first
// continuation byte may take; all later continuation bytes must lie in
// [0x80, 0xbf].
bool IsUtf8(const std::string &text) {
  const auto *bytes = reinterpret_cast<const uint8_t *>(text.data());
  const int32_t len = static_cast<int32_t>(text.size());

  int32_t pos = 0;
  while (pos < len) {
    const uint8_t lead = bytes[pos];

    // Single-byte (ASCII) character.
    if (lead <= 0x7f) {
      ++pos;
      continue;
    }

    // Total length of the sequence and the bounds of its second byte.
    int32_t width = 0;
    uint8_t second_lo = 0x80;
    uint8_t second_hi = 0xbf;

    if (InRange(lead, 0xc2, 0xdf)) {
      width = 2;
    } else if (lead == 0xe0) {
      width = 3;
      second_lo = 0xa0;
    } else if (InRange(lead, 0xe1, 0xec) || InRange(lead, 0xee, 0xef)) {
      width = 3;
    } else if (lead == 0xed) {
      width = 3;
      second_hi = 0x9f;
    } else if (lead == 0xf0) {
      width = 4;
      second_lo = 0x90;
    } else if (InRange(lead, 0xf1, 0xf3)) {
      width = 4;
    } else if (lead == 0xf4) {
      width = 4;
      second_hi = 0x8f;
    } else {
      // Not a byte that may start a sequence.
      return false;
    }

    // The sequence must fit entirely inside the string.
    if (pos + width > len) {
      return false;
    }

    if (!InRange(bytes[pos + 1], second_lo, second_hi)) {
      return false;
    }

    for (int32_t k = 2; k < width; ++k) {
      if (!InRange(bytes[pos + k], 0x80, 0xbf)) {
        return false;
      }
    }

    pos += width;
  }

  return true;
}

// Returns true if `text` consists solely of single bytes <= 0x7f and of
// two-byte pairs whose first byte is in [0xa1, 0xf7] and whose second byte is
// in [0xa1, 0xfe]. Returns false at the first byte that fits neither pattern,
// including a first byte of a pair that is the last byte of the string.
// An empty string yields true.
bool IsGB2312(const std::string &text) {
  const auto *bytes = reinterpret_cast<const uint8_t *>(text.data());
  const int32_t len = static_cast<int32_t>(text.size());

  for (int32_t pos = 0; pos < len;) {
    if (bytes[pos] <= 0x7f) {
      ++pos;
      continue;
    }

    // Check that a second byte exists before reading it.
    const bool has_second = pos + 1 < len;
    if (!has_second || !InRange(bytes[pos], 0xa1, 0xf7) ||
        !InRange(bytes[pos + 1], 0xa1, 0xfe)) {
      return false;
    }

    pos += 2;
  }

  return true;
}

#if defined(_WIN32)
// Converts `text` from Windows code page 936 (GB2312) to UTF-8.
//
// The conversion is done in two steps with the Win32 API: code page 936 ->
// UTF-16 (MultiByteToWideChar), then UTF-16 -> UTF-8 (WideCharToMultiByte).
//
// Returns the UTF-8 encoded string, without any trailing NUL character.
// Returns an empty string if `text` is empty or if any conversion step fails.
std::string Gb2312ToUtf8(const std::string &text) {
  if (text.empty()) {
    return {};
  }

  // 936 is from
  // https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
  // GB2312 -> 936
  constexpr UINT kCodePageGb2312 = 936;

  // The Win32 API takes lengths as int.
  const int32_t num_bytes = static_cast<int32_t>(text.size());

  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar
  // First call: query how many UTF-16 code units are needed.
  const int32_t num_wchars = MultiByteToWideChar(
      kCodePageGb2312, 0, text.data(), num_bytes, nullptr, 0);
  if (num_wchars <= 0) {
    SHERPA_ONNX_LOGE("Failed to convert the GB2312 encoded string to UTF-16.");
    return {};
  }

  std::wstring wstr(num_wchars, L'\0');
  if (MultiByteToWideChar(kCodePageGb2312, 0, text.data(), num_bytes,
                          wstr.data(), num_wchars) == 0) {
    SHERPA_ONNX_LOGE("Failed to convert the GB2312 encoded string to UTF-16.");
    return {};
  }

  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
  // Pass the explicit length instead of -1. With -1 the API also converts the
  // terminating NUL and counts it in the result, which would leave a stray
  // '\0' at the end of the returned std::string.
  const int32_t num_chars = WideCharToMultiByte(
      CP_UTF8, 0, wstr.data(), num_wchars, nullptr, 0, nullptr, nullptr);
  if (num_chars <= 0) {
    SHERPA_ONNX_LOGE("Failed to convert the UTF-16 string to UTF-8.");
    return {};
  }

  std::string ans(num_chars, '\0');
  if (WideCharToMultiByte(CP_UTF8, 0, wstr.data(), num_wchars, ans.data(),
                          num_chars, nullptr, nullptr) == 0) {
    SHERPA_ONNX_LOGE("Failed to convert the UTF-16 string to UTF-8.");
    return {};
  }

  return ans;
}
#endif

} // namespace sherpa_onnx
12 changes: 12 additions & 0 deletions sherpa-onnx/csrc/text-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,18 @@ void ToLowerCase(std::string *in_out);
std::string RemoveInvalidUtf8Sequences(const std::string &text,
bool show_debug_msg = false);

// Return true if the whole of text is a well-formed UTF-8 byte sequence
// (an empty string counts as valid).
// Return false as soon as a malformed or truncated sequence is found.
bool IsUtf8(const std::string &text);

// Return true if the whole of text is made of single bytes <= 0x7f and
// GB2312 two-byte pairs (first byte in [0xa1, 0xf7], second byte in
// [0xa1, 0xfe]). An empty string yields true.
// Return false otherwise.
//
// Note that this only checks byte ranges, so text in another encoding can
// also pass; callers in need of certainty should test IsUtf8() first.
bool IsGB2312(const std::string &text);

#if defined(_WIN32)
// Convert text from Windows code page 936 (GB2312) to UTF-8 using the
// Win32 API. Only available on Windows.
// Return an empty string if the conversion fails.
std::string Gb2312ToUtf8(const std::string &text);
#endif

} // namespace sherpa_onnx

#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_

0 comments on commit d38cb81

Please sign in to comment.