From e16259c37d15131b2384580ef80f5d8c8b070431 Mon Sep 17 00:00:00 2001 From: Larry Gritz Date: Wed, 26 Jan 2022 20:33:50 -0800 Subject: [PATCH] utf8 / utf16 conversion modernization. (#3307) Our Strutil utilities for utf8 <-> utf16 conversion were based on Windows specific calls that we had added even before C++11 was our minimum. Replace the Windows-specific code with std calls, and expose them in strutil.h for all platforms (why not?). --- src/include/OpenImageIO/strutil.h | 44 ++++----------------- src/libutil/strutil.cpp | 64 ++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 56 deletions(-) diff --git a/src/include/OpenImageIO/strutil.h b/src/include/OpenImageIO/strutil.h index bd9ea43e76..65f1b6c02e 100644 --- a/src/include/OpenImageIO/strutil.h +++ b/src/include/OpenImageIO/strutil.h @@ -792,46 +792,16 @@ struct OIIO_UTIL_API StringILess { -#ifdef _WIN32 -/// Conversion functions between UTF-8 and UTF-16 for windows. -/// -/// For historical reasons, the standard encoding for strings on windows is -/// UTF-16, whereas the unix world seems to have settled on UTF-8. These two -/// encodings can be stored in std::string and std::wstring respectively, with -/// the caveat that they're both variable-width encodings, so not all the -/// standard string methods will make sense (for example std::string::size() -/// won't return the number of glyphs in a UTF-8 string, unless it happens to -/// be made up of only the 7-bit ASCII subset). -/// -/// The standard windows API functions usually have two versions, a UTF-16 -/// version with a 'W' suffix (using wchar_t* strings), and an ANSI version -/// with a 'A' suffix (using char* strings) which uses the current windows -/// code page to define the encoding. (To make matters more confusing there is -/// also a further "TCHAR" version which is #defined to the UTF-16 or ANSI -/// version, depending on whether UNICODE is defined during compilation. 
-/// This is meant to make it possible to support compiling libraries in -/// either unicode or ansi mode from the same codebase.) -/// -/// Using std::string as the string container (as in OIIO) implies that we -/// can't use UTF-16. It also means we need a variable-width encoding to -/// represent characters in non-Latin alphabets in an unambiguous way; the -/// obvious candidate is UTF-8. File paths in OIIO are considered to be -/// represented in UTF-8, and must be converted to UTF-16 before passing to -/// windows API file opening functions. - -/// On the other hand, the encoding used for the ANSI versions of the windows -/// API is the current windows code page. This is more compatible with the -/// default setup of the standard windows command prompt, and may be more -/// appropriate for error messages. - -// Conversion to wide char -// +/// Conversion of normal char-based strings (presumed to be UTF-8 encoding) +/// to wide char string, wstring. std::wstring OIIO_UTIL_API utf8_to_utf16 (string_view utf8str) noexcept; -// Conversion from wide char -// +/// Conversion from wstring UTF-16 to a UTF-8 std::string. This is the +/// standard way to convert from Windows wide character strings used for +/// filenames into the UTF-8 strings OIIO expects for filenames when passed to +/// functions like ImageInput::open(). std::string OIIO_UTIL_API utf16_to_utf8(const std::wstring& utf16str) noexcept; -#endif + /// Copy at most size characters (including terminating 0 character) from diff --git a/src/libutil/strutil.cpp b/src/libutil/strutil.cpp index e634f7e4d2..584e9827b6 100644 --- a/src/libutil/strutil.cpp +++ b/src/libutil/strutil.cpp @@ -4,13 +4,14 @@ #include +#include #include #include #include #include #include #include -#include +#include #include #include #include @@ -744,18 +745,46 @@ Strutil::replace(string_view str, string_view pattern, string_view replacement, -#ifdef _WIN32 +// Conversion functions between UTF-8 and UTF-16 for windows. 
+// +// For historical reasons, the standard encoding for strings on windows is +// UTF-16, whereas the unix world seems to have settled on UTF-8. These two +// encodings can be stored in std::string and std::wstring respectively, with +// the caveat that they're both variable-width encodings, so not all the +// standard string methods will make sense (for example std::string::size() +// won't return the number of glyphs in a UTF-8 string, unless it happens to +// be made up of only the 7-bit ASCII subset). +// +// The standard windows API functions usually have two versions, a UTF-16 +// version with a 'W' suffix (using wchar_t* strings), and an ANSI version +// with a 'A' suffix (using char* strings) which uses the current windows +// code page to define the encoding. (To make matters more confusing there is +// also a further "TCHAR" version which is #defined to the UTF-16 or ANSI +// version, depending on whether UNICODE is defined during compilation. +// This is meant to make it possible to support compiling libraries in +// either unicode or ansi mode from the same codebase.) +// +// Using std::string as the string container (as in OIIO) implies that we +// can't use UTF-16. It also means we need a variable-width encoding to +// represent characters in non-Latin alphabets in an unambiguous way; the +// obvious candidate is UTF-8. File paths in OIIO are considered to be +// represented in UTF-8, and must be converted to UTF-16 before passing to +// windows API file opening functions. +// +// On the other hand, the encoding used for the ANSI versions of the windows +// API is the current windows code page. This is more compatible with the +// default setup of the standard windows command prompt, and may be more +// appropriate for error messages. 
+ std::wstring Strutil::utf8_to_utf16(string_view str) noexcept { - std::wstring native; - - native.resize( - MultiByteToWideChar(CP_UTF8, 0, str.data(), str.length(), NULL, 0)); - MultiByteToWideChar(CP_UTF8, 0, str.data(), str.length(), &native[0], - (int)native.size()); - - return native; + try { + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> conv; + return conv.from_bytes(str.data(), str.data() + str.size()); + } catch (const std::exception&) { + return std::wstring(); + } } @@ -763,16 +792,13 @@ Strutil::utf8_to_utf16(string_view str) noexcept std::string Strutil::utf16_to_utf8(const std::wstring& str) noexcept { - std::string utf8; - - utf8.resize(WideCharToMultiByte(CP_UTF8, 0, str.data(), str.length(), NULL, - 0, NULL, NULL)); - WideCharToMultiByte(CP_UTF8, 0, str.data(), str.length(), &utf8[0], - (int)utf8.size(), NULL, NULL); - - return utf8; + try { + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> conv; + return conv.to_bytes(str); + } catch (const std::exception&) { + return std::string(); + } } -#endif