From 2e1c86d859d09a65ddd154497ef1173bca0f38bb Mon Sep 17 00:00:00 2001 From: chausner Date: Wed, 27 May 2020 22:09:54 +0200 Subject: [PATCH] Fix misalignment of table output for multi-byte chars (#268) (#313) * Fix misalignment of table output for multi-byte chars * Use ICU library to implement Utility::UTF8Length and Utility::UTF8Substring * Use Unicode ellipsis character in TableOutput instead of three dots * Move ICU functionality into a separate type with wil::unique_any fields. * Fix incorrect error log Co-authored-by: John McPherson --- src/AppInstallerCLICore/TableOutput.h | 16 ++-- src/AppInstallerCLITests/Strings.cpp | 29 ++++++ .../AppInstallerStrings.cpp | 93 +++++++++++++++++++ .../Public/AppInstallerStrings.h | 6 ++ 4 files changed, 136 insertions(+), 8 deletions(-) diff --git a/src/AppInstallerCLICore/TableOutput.h b/src/AppInstallerCLICore/TableOutput.h index 14592d3e01791..1bf7891b1a49b 100644 --- a/src/AppInstallerCLICore/TableOutput.h +++ b/src/AppInstallerCLICore/TableOutput.h @@ -40,7 +40,7 @@ namespace AppInstaller::CLI::Execution for (size_t i = 0; i < FieldCount; ++i) { m_columns[i].Name = std::move(header[i]); - m_columns[i].MinLength = m_columns[i].Name.length(); + m_columns[i].MinLength = Utility::UTF8Length(m_columns[i].Name); m_columns[i].MaxLength = 0; } } @@ -91,7 +91,7 @@ namespace AppInstaller::CLI::Execution { for (size_t i = 0; i < FieldCount; ++i) { - m_columns[i].MaxLength = std::max(m_columns[i].MaxLength, line[i].length()); + m_columns[i].MaxLength = std::max(m_columns[i].MaxLength, Utility::UTF8Length(line[i])); } } @@ -187,12 +187,12 @@ namespace AppInstaller::CLI::Execution if (col.MaxLength) { - if (line[i].length() > col.MaxLength) + size_t valueLength = Utility::UTF8Length(line[i]); + + if (valueLength > col.MaxLength) { - size_t replaceChars = std::min(col.MaxLength, static_cast(3)); - std::string replacement = line[i].substr(0, col.MaxLength - replaceChars); - replacement.append(replaceChars, '.'); - out << replacement; + out << Utility::UTF8Substring(line[i], 0, col.MaxLength - 1); + out << "\xE2\x80\xA6"; // UTF8 encoding of ellipsis (…) character if (col.SpaceAfter) { @@ -205,7 +205,7 @@ namespace AppInstaller::CLI::Execution if (col.SpaceAfter) { - out << std::string(col.MaxLength - line[i].length() + 1, ' '); + out << std::string(col.MaxLength - valueLength + 1, ' '); } } } diff --git a/src/AppInstallerCLITests/Strings.cpp b/src/AppInstallerCLITests/Strings.cpp index 6f76c0d097aba..9edc2a2f6c4fb 100644 --- a/src/AppInstallerCLITests/Strings.cpp +++ b/src/AppInstallerCLITests/Strings.cpp @@ -7,6 +7,35 @@ using namespace AppInstaller::Utility; +TEST_CASE("UTF8Length", "[strings]") +{ + REQUIRE(UTF8Length("") == 0); + REQUIRE(UTF8Length("a") == 1); + REQUIRE(UTF8Length(" a b c ") == 7); + REQUIRE(UTF8Length("K\xC3\xA4se") == 4); // "Käse" + REQUIRE(UTF8Length("bye\xE2\x80\xA6") == 4); // "bye…" + REQUIRE(UTF8Length("\xf0\x9f\xa6\x86") == 1); // [duck emoji] + REQUIRE(UTF8Length("\xf0\x9d\x85\xa0\xf0\x9d\x85\xa0") == 2); // [8th note][8th note] +} + +TEST_CASE("UTF8Substring", "[strings]") +{ + REQUIRE(UTF8Substring("", 0, 0) == ""); + REQUIRE(UTF8Substring("abcd", 0, 4) == "abcd"); + REQUIRE(UTF8Substring("abcd", 0, 2) == "ab"); + REQUIRE(UTF8Substring("abcd", 1, 0) == ""); + REQUIRE(UTF8Substring("abcd", 1, 1) == "b"); + REQUIRE(UTF8Substring("abcd", 1, 3) == "bcd"); + REQUIRE(UTF8Substring("abcd", 4, 0) == ""); + + const char* s = "\xf0\x9f\xa6\x86s like \xf0\x9f\x8c\x8a"; // [duck emoji]s like [wave emoji] + REQUIRE(UTF8Substring(s, 0, 9) == "\xf0\x9f\xa6\x86s like \xf0\x9f\x8c\x8a"); + REQUIRE(UTF8Substring(s, 0, 1) == "\xf0\x9f\xa6\x86"); + REQUIRE(UTF8Substring(s, 0, 2) == "\xf0\x9f\xa6\x86s"); + REQUIRE(UTF8Substring(s, 1, 7) == "s like "); + REQUIRE(UTF8Substring(s, 1, 8) == "s like \xf0\x9f\x8c\x8a"); +} + TEST_CASE("Normalize", "[strings]") { REQUIRE(Normalize("test") == "test"); diff --git a/src/AppInstallerCommonCore/AppInstallerStrings.cpp b/src/AppInstallerCommonCore/AppInstallerStrings.cpp index f482277024a23..29b7e4ef4854e 100644 --- a/src/AppInstallerCommonCore/AppInstallerStrings.cpp +++ b/src/AppInstallerCommonCore/AppInstallerStrings.cpp @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. #include "pch.h" +#include "Public/AppInstallerLogging.h" #include "Public/AppInstallerStrings.h" +#include "icu.h" namespace AppInstaller::Utility { @@ -12,6 +14,55 @@ namespace AppInstaller::Utility constexpr std::string_view s_SpaceChars = AICLI_SPACE_CHARS; constexpr std::wstring_view s_WideSpaceChars = L"" AICLI_SPACE_CHARS; + namespace + { + // Contains the ICU objects necessary to do break iteration. + struct ICUBreakIterator + { + ICUBreakIterator(std::string_view input, UBreakIteratorType type) + { + UErrorCode err = U_ZERO_ERROR; + + m_text.reset(utext_openUTF8(nullptr, input.data(), wil::safe_cast(input.length()), &err)); + if (U_FAILURE(err)) + { + AICLI_LOG(Core, Error, << "utext_openUTF8 returned " << err); + THROW_HR(E_UNEXPECTED); + } + + m_brk.reset(ubrk_open(type, nullptr, nullptr, 0, &err)); + if (U_FAILURE(err)) + { + AICLI_LOG(Core, Error, << "ubrk_open returned " << err); + THROW_HR(E_UNEXPECTED); + } + + ubrk_setUText(m_brk.get(), m_text.get(), &err); + if (U_FAILURE(err)) + { + AICLI_LOG(Core, Error, << "ubrk_setUText returned " << err); + THROW_HR(E_UNEXPECTED); + } + + int32_t i = ubrk_first(m_brk.get()); + if (i != 0) + { + AICLI_LOG(Core, Error, << "ubrk_first returned " << i); + THROW_HR(E_UNEXPECTED); + } + } + + int32_t Next() + { + return ubrk_next(m_brk.get()); + } + + private: + wil::unique_any m_text; + wil::unique_any m_brk; + }; + } + bool CaseInsensitiveEquals(std::string_view a, std::string_view b) { // TODO: When we bring in ICU, do this correctly. @@ -58,6 +109,48 @@ namespace AppInstaller::Utility return result; } + size_t UTF8Length(std::string_view input) + { + ICUBreakIterator itr{ input, UBRK_CHARACTER }; + + size_t numGraphemeClusters = 0; + + while (itr.Next() != UBRK_DONE) + { + numGraphemeClusters++; + } + + return numGraphemeClusters; + } + + std::string_view UTF8Substring(std::string_view input, size_t offset, size_t count) + { + ICUBreakIterator itr{ input, UBRK_CHARACTER }; + + size_t utf8Offset = 0; + size_t utf8Count = 0; + size_t graphemeClusterOffset = 0; + int32_t i = 0; + + while (i != UBRK_DONE) + { + if (graphemeClusterOffset == offset) + { + utf8Offset = i; + } + else if (graphemeClusterOffset == offset + count) + { + utf8Count = i - utf8Offset; + break; + } + + i = itr.Next(); + graphemeClusterOffset++; + } + + return input.substr(utf8Offset, utf8Count); + } + std::string Normalize(std::string_view input, NORM_FORM form) { if (input.empty()) diff --git a/src/AppInstallerCommonCore/Public/AppInstallerStrings.h b/src/AppInstallerCommonCore/Public/AppInstallerStrings.h index a568768d08eab..e0b5cf637bcaa 100644 --- a/src/AppInstallerCommonCore/Public/AppInstallerStrings.h +++ b/src/AppInstallerCommonCore/Public/AppInstallerStrings.h @@ -17,6 +17,12 @@ namespace AppInstaller::Utility // Converts the given UTF8 string to UTF16 std::wstring ConvertToUTF16(std::string_view input); + // Returns the number of grapheme clusters (characters) in an UTF8-encoded string. + size_t UTF8Length(std::string_view input); + + // Returns a substring view in an UTF8-encoded string. Offset and count are measured in grapheme clusters (characters). + std::string_view UTF8Substring(std::string_view input, size_t offset, size_t count); + // Normalizes a UTF8 string to the given form. std::string Normalize(std::string_view input, NORM_FORM form = NORM_FORM::NormalizationKC);