Skip to content

Commit

Permalink
Fix misalignment of table output for multi-byte chars (microsoft#268) (
Browse files Browse the repository at this point in the history
…microsoft#313)

* Fix misalignment of table output for multi-byte chars

* Use ICU library to implement Utility::UTF8Length and Utility::UTF8Substring

* Use Unicode ellipsis character in TableOutput instead of three dots

* Move ICU functionality into a separate type with wil::unique_any fields.

* Fix incorrect error log

Co-authored-by: John McPherson <[email protected]>
  • Loading branch information
chausner and JohnMcPMS authored May 27, 2020
1 parent 1320343 commit 2e1c86d
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 8 deletions.
16 changes: 8 additions & 8 deletions src/AppInstallerCLICore/TableOutput.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace AppInstaller::CLI::Execution
for (size_t i = 0; i < FieldCount; ++i)
{
m_columns[i].Name = std::move(header[i]);
m_columns[i].MinLength = m_columns[i].Name.length();
m_columns[i].MinLength = Utility::UTF8Length(m_columns[i].Name);
m_columns[i].MaxLength = 0;
}
}
Expand Down Expand Up @@ -91,7 +91,7 @@ namespace AppInstaller::CLI::Execution
{
for (size_t i = 0; i < FieldCount; ++i)
{
m_columns[i].MaxLength = std::max(m_columns[i].MaxLength, line[i].length());
m_columns[i].MaxLength = std::max(m_columns[i].MaxLength, Utility::UTF8Length(line[i]));
}
}

Expand Down Expand Up @@ -187,12 +187,12 @@ namespace AppInstaller::CLI::Execution

if (col.MaxLength)
{
if (line[i].length() > col.MaxLength)
size_t valueLength = Utility::UTF8Length(line[i]);

if (valueLength > col.MaxLength)
{
size_t replaceChars = std::min(col.MaxLength, static_cast<size_t>(3));
std::string replacement = line[i].substr(0, col.MaxLength - replaceChars);
replacement.append(replaceChars, '.');
out << replacement;
out << Utility::UTF8Substring(line[i], 0, col.MaxLength - 1);
out << "\xE2\x80\xA6"; // UTF8 encoding of ellipsis (…) character

if (col.SpaceAfter)
{
Expand All @@ -205,7 +205,7 @@ namespace AppInstaller::CLI::Execution

if (col.SpaceAfter)
{
out << std::string(col.MaxLength - line[i].length() + 1, ' ');
out << std::string(col.MaxLength - valueLength + 1, ' ');
}
}
}
Expand Down
29 changes: 29 additions & 0 deletions src/AppInstallerCLITests/Strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,35 @@
using namespace AppInstaller::Utility;


TEST_CASE("UTF8Length", "[strings]")
{
REQUIRE(UTF8Length("") == 0);
REQUIRE(UTF8Length("a") == 1);
REQUIRE(UTF8Length(" a b c ") == 7);
REQUIRE(UTF8Length("K\xC3\xA4se") == 4); // "Käse"
REQUIRE(UTF8Length("bye\xE2\x80\xA6") == 4); // "bye…"
REQUIRE(UTF8Length("\xf0\x9f\xa6\x86") == 1); // [duck emoji]
REQUIRE(UTF8Length("\xf0\x9d\x85\xa0\xf0\x9d\x85\xa0") == 2); // [8th note][8th note]
}

TEST_CASE("UTF8Substring", "[strings]")
{
REQUIRE(UTF8Substring("", 0, 0) == "");
REQUIRE(UTF8Substring("abcd", 0, 4) == "abcd");
REQUIRE(UTF8Substring("abcd", 0, 2) == "ab");
REQUIRE(UTF8Substring("abcd", 1, 0) == "");
REQUIRE(UTF8Substring("abcd", 1, 1) == "b");
REQUIRE(UTF8Substring("abcd", 1, 3) == "bcd");
REQUIRE(UTF8Substring("abcd", 4, 0) == "");

const char* s = "\xf0\x9f\xa6\x86s like \xf0\x9f\x8c\x8a"; // [duck emoji]s like [wave emoji]
REQUIRE(UTF8Substring(s, 0, 9) == "\xf0\x9f\xa6\x86s like \xf0\x9f\x8c\x8a");
REQUIRE(UTF8Substring(s, 0, 1) == "\xf0\x9f\xa6\x86");
REQUIRE(UTF8Substring(s, 0, 2) == "\xf0\x9f\xa6\x86s");
REQUIRE(UTF8Substring(s, 1, 7) == "s like ");
REQUIRE(UTF8Substring(s, 1, 8) == "s like \xf0\x9f\x8c\x8a");
}

TEST_CASE("Normalize", "[strings]")
{
REQUIRE(Normalize("test") == "test");
Expand Down
93 changes: 93 additions & 0 deletions src/AppInstallerCommonCore/AppInstallerStrings.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "pch.h"
#include "Public/AppInstallerLogging.h"
#include "Public/AppInstallerStrings.h"
#include "icu.h"

namespace AppInstaller::Utility
{
Expand All @@ -12,6 +14,55 @@ namespace AppInstaller::Utility
constexpr std::string_view s_SpaceChars = AICLI_SPACE_CHARS;
constexpr std::wstring_view s_WideSpaceChars = L"" AICLI_SPACE_CHARS;

namespace
{
// Contains the ICU objects necessary to do break iteration.
struct ICUBreakIterator
{
ICUBreakIterator(std::string_view input, UBreakIteratorType type)
{
UErrorCode err = U_ZERO_ERROR;

m_text.reset(utext_openUTF8(nullptr, input.data(), wil::safe_cast<int64_t>(input.length()), &err));
if (U_FAILURE(err))
{
AICLI_LOG(Core, Error, << "utext_openUTF8 returned " << err);
THROW_HR(E_UNEXPECTED);
}

m_brk.reset(ubrk_open(type, nullptr, nullptr, 0, &err));
if (U_FAILURE(err))
{
AICLI_LOG(Core, Error, << "ubrk_open returned " << err);
THROW_HR(E_UNEXPECTED);
}

ubrk_setUText(m_brk.get(), m_text.get(), &err);
if (U_FAILURE(err))
{
AICLI_LOG(Core, Error, << "ubrk_setUText returned " << err);
THROW_HR(E_UNEXPECTED);
}

int32_t i = ubrk_first(m_brk.get());
if (i != 0)
{
AICLI_LOG(Core, Error, << "ubrk_first returned " << i);
THROW_HR(E_UNEXPECTED);
}
}

int32_t Next()
{
return ubrk_next(m_brk.get());
}

private:
wil::unique_any<UText*, decltype(utext_close), &utext_close> m_text;
wil::unique_any<UBreakIterator*, decltype(ubrk_close), &ubrk_close> m_brk;
};
}

bool CaseInsensitiveEquals(std::string_view a, std::string_view b)
{
// TODO: When we bring in ICU, do this correctly.
Expand Down Expand Up @@ -58,6 +109,48 @@ namespace AppInstaller::Utility
return result;
}

size_t UTF8Length(std::string_view input)
{
ICUBreakIterator itr{ input, UBRK_CHARACTER };

size_t numGraphemeClusters = 0;

while (itr.Next() != UBRK_DONE)
{
numGraphemeClusters++;
}

return numGraphemeClusters;
}

std::string_view UTF8Substring(std::string_view input, size_t offset, size_t count)
{
ICUBreakIterator itr{ input, UBRK_CHARACTER };

size_t utf8Offset = 0;
size_t utf8Count = 0;
size_t graphemeClusterOffset = 0;
int32_t i = 0;

while (i != UBRK_DONE)
{
if (graphemeClusterOffset == offset)
{
utf8Offset = i;
}
else if (graphemeClusterOffset == offset + count)
{
utf8Count = i - utf8Offset;
break;
}

i = itr.Next();
graphemeClusterOffset++;
}

return input.substr(utf8Offset, utf8Count);
}

std::string Normalize(std::string_view input, NORM_FORM form)
{
if (input.empty())
Expand Down
6 changes: 6 additions & 0 deletions src/AppInstallerCommonCore/Public/AppInstallerStrings.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ namespace AppInstaller::Utility
// Converts the given UTF8 string to UTF16
std::wstring ConvertToUTF16(std::string_view input);

// Returns the number of grapheme clusters (characters) in an UTF8-encoded string.
size_t UTF8Length(std::string_view input);

// Returns a substring view in an UTF8-encoded string. Offset and count are measured in grapheme clusters (characters).
std::string_view UTF8Substring(std::string_view input, size_t offset, size_t count);

// Normalizes a UTF8 string to the given form.
std::string Normalize(std::string_view input, NORM_FORM form = NORM_FORM::NormalizationKC);

Expand Down

0 comments on commit 2e1c86d

Please sign in to comment.