Skip to content

Commit

Permalink
minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
strega-nil committed Jan 26, 2022
1 parent da8b942 commit 6aeff52
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 47 deletions.
29 changes: 17 additions & 12 deletions include/vcpkg/base/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,29 @@ namespace vcpkg::Unicode
StartFour = 4,
};

// Error conditions reported while decoding UTF-8.
// The numeric values are spelled out explicitly; keep them stable.
enum class utf8_errc
{
// Decoding succeeded; also reported when the input range is empty (first == last).
NoError = 0,
// A code unit was classified as Utf8CodeUnitKind::Invalid.
InvalidCodeUnit = 1,
// The assembled value is greater than 0x10'FFFF.
InvalidCodePoint = 2,
// A trailing surrogate code point was decoded directly after a leading surrogate code point.
PairedSurrogates = 3,
// A continuation code unit appeared where the first code unit of a code point was expected.
UnexpectedContinue = 4,
// A non-continuation code unit appeared where a continuation code unit was required.
UnexpectedStart = 5,
// The leading code unit requires more code units than remain in the input range.
UnexpectedEof = 6,
};

Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;
int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept;
int utf8_code_unit_count(char code_unit) noexcept;

int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;

const char* utf8_decode_code_point(const char* first, const char* last, char32_t& out) noexcept;
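// Returns {pointer just past the code units consumed, error}.
// If error == NoError and first != last, then out = the parsed code point.
// If first == last, returns {last, NoError} and out = end_of_file.
// Otherwise (error != NoError), out = end_of_file.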
std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
const char* last,
char32_t& out) noexcept;

inline std::string& utf8_append_code_point(std::string& str, char32_t code_point)
{
Expand Down Expand Up @@ -56,17 +72,6 @@ namespace vcpkg::Unicode

constexpr static char32_t end_of_file = 0xFFFF'FFFF;

// Error conditions reported while decoding UTF-8.
// The numeric values are spelled out explicitly; keep them stable.
enum class utf8_errc
{
// Decoding succeeded; also reported when the input range is empty (first == last).
NoError = 0,
// A code unit was classified as Utf8CodeUnitKind::Invalid.
InvalidCodeUnit = 1,
// The assembled value is greater than 0x10'FFFF.
InvalidCodePoint = 2,
// A trailing surrogate code point was decoded directly after a leading surrogate code point.
PairedSurrogates = 3,
// A continuation code unit appeared where the first code unit of a code point was expected.
UnexpectedContinue = 4,
// A non-continuation code unit appeared where a continuation code unit was required.
UnexpectedStart = 5,
// The leading code unit requires more code units than remain in the input range.
UnexpectedEof = 6,
};

const std::error_category& utf8_category() noexcept;

inline std::error_code make_error_code(utf8_errc err) noexcept
Expand Down
63 changes: 29 additions & 34 deletions src/vcpkg/base/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,28 +92,39 @@ namespace vcpkg::Unicode
return count;
}

const char* utf8_decode_code_point(const char* first, const char* last, char32_t& out) noexcept
std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
const char* last,
char32_t& out) noexcept
{
out = end_of_file;
if (first == last)
{
return first;
return {last, utf8_errc::NoError};
}

auto code_unit = *first;
auto kind = utf8_code_unit_kind(code_unit);
const int count = utf8_code_unit_count(kind);

if (kind == Utf8CodeUnitKind::Invalid || kind == Utf8CodeUnitKind::Continue || count > last - first)
const char* it = first + 1;

if (kind == Utf8CodeUnitKind::Invalid)
{
return first;
return {it, utf8_errc::InvalidCodeUnit};
}
else if (kind == Utf8CodeUnitKind::Continue)
{
return {it, utf8_errc::UnexpectedContinue};
}
else if (count > last - first)
{
return {last, utf8_errc::UnexpectedEof};
}
const char* it = first + 1;

if (count == 1)
{
out = static_cast<char32_t>(code_unit);
return it;
return {it, utf8_errc::NoError};
}

// 2 -> 0b0001'1111, 6
Expand All @@ -131,18 +142,23 @@ namespace vcpkg::Unicode
kind = utf8_code_unit_kind(code_unit);
if (kind == Utf8CodeUnitKind::Invalid)
{
return it;
return {it, utf8_errc::InvalidCodeUnit};
}
else if (kind != Utf8CodeUnitKind::Continue)
{
return it;
return {it, utf8_errc::UnexpectedStart};
}

const int shift = 6 * (count - byte - 1);
code_point |= (code_unit & continue_mask) << shift;
}
out = code_point;
return it;

if (code_point > 0x10'FFFF)
{
return {it, utf8_errc::InvalidCodePoint};
}

return {it, utf8_errc::NoError};
}

bool utf8_is_valid_string(const char* first, const char* last) noexcept
Expand Down Expand Up @@ -228,31 +244,10 @@ namespace vcpkg::Unicode

char32_t code_point;
auto new_next = utf8_decode_code_point(next_, last_, code_point);
if (code_point == end_of_file)
{
const char* old_next = next_;
*this = sentinel();
if (new_next == last_)
{
return utf8_errc::NoError;
}
if (new_next == old_next)
{
auto kind = utf8_code_unit_kind(*new_next);
if (kind == Utf8CodeUnitKind::Invalid) return utf8_errc::InvalidCodeUnit;
if (kind == Utf8CodeUnitKind::Continue) return utf8_errc::UnexpectedContinue;
return utf8_errc::UnexpectedEof;
}

auto kind = utf8_code_unit_kind(*new_next);
if (kind == Utf8CodeUnitKind::Invalid) return utf8_errc::InvalidCodeUnit;
return utf8_errc::UnexpectedStart;
}

if (code_point > 0x10'FFFF)
if (new_next.second != utf8_errc::NoError)
{
*this = sentinel();
return utf8_errc::InvalidCodePoint;
return new_next.second;
}

if (utf16_is_trailing_surrogate_code_point(code_point) && utf16_is_leading_surrogate_code_point(current_))
Expand All @@ -261,7 +256,7 @@ namespace vcpkg::Unicode
return utf8_errc::PairedSurrogates;
}

next_ = new_next;
next_ = new_next.first;
current_ = code_point;
return utf8_errc::NoError;
}
Expand Down
4 changes: 3 additions & 1 deletion src/vcpkg/sourceparagraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -866,9 +866,11 @@ namespace vcpkg
{
char32_t ch;
auto char_last = Unicode::utf8_decode_code_point(start, sv.end(), ch);
// Unicode errors should be impossible here, since the UTF-8 is valid by JSON construction.
Checks::check_exit(VCPKG_LINE_INFO, char_last.second == Unicode::utf8_errc::NoError);
return msg::format(msgLicenseExpressionContainsUnicode,
msg::value = ch,
msg::pretty_value = StringView{start, char_last});
msg::pretty_value = StringView{start, char_last.first});
}

return msg::format(msgLicenseExpressionContainsInvalidCharacter, msg::value = *start);
Expand Down

0 comments on commit 6aeff52

Please sign in to comment.