From 6aeff5290f03b75ca16b59452188e2ff453fb85f Mon Sep 17 00:00:00 2001 From: nicole mazzuca Date: Wed, 26 Jan 2022 12:22:34 -0800 Subject: [PATCH] minor changes --- include/vcpkg/base/unicode.h | 29 +++++++++------- src/vcpkg/base/unicode.cpp | 63 ++++++++++++++++------------------- src/vcpkg/sourceparagraph.cpp | 4 ++- 3 files changed, 49 insertions(+), 47 deletions(-) diff --git a/include/vcpkg/base/unicode.h b/include/vcpkg/base/unicode.h index 076bf63669..2265e9699b 100644 --- a/include/vcpkg/base/unicode.h +++ b/include/vcpkg/base/unicode.h @@ -14,13 +14,29 @@ namespace vcpkg::Unicode StartFour = 4, }; + enum class utf8_errc + { + NoError = 0, + InvalidCodeUnit = 1, + InvalidCodePoint = 2, + PairedSurrogates = 3, + UnexpectedContinue = 4, + UnexpectedStart = 5, + UnexpectedEof = 6, + }; + Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept; int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept; int utf8_code_unit_count(char code_unit) noexcept; int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept; - const char* utf8_decode_code_point(const char* first, const char* last, char32_t& out) noexcept; + // returns {after-current-code-point, error}, + // and if error = NoError, then out = parsed code point. + // else, out = end_of_file. + std::pair utf8_decode_code_point(const char* first, + const char* last, + char32_t& out) noexcept; inline std::string& utf8_append_code_point(std::string& str, char32_t code_point) { @@ -56,17 +72,6 @@ namespace vcpkg::Unicode constexpr static char32_t end_of_file = 0xFFFF'FFFF; - enum class utf8_errc - { - NoError = 0, - InvalidCodeUnit = 1, - InvalidCodePoint = 2, - PairedSurrogates = 3, - UnexpectedContinue = 4, - UnexpectedStart = 5, - UnexpectedEof = 6, - }; - const std::error_category& utf8_category() noexcept; inline std::error_code make_error_code(utf8_errc err) noexcept diff --git a/src/vcpkg/base/unicode.cpp b/src/vcpkg/base/unicode.cpp index 1bc8813da6..ee31c03707 100644 --- a/src/vcpkg/base/unicode.cpp +++ b/src/vcpkg/base/unicode.cpp @@ -92,28 +92,39 @@ namespace vcpkg::Unicode return count; } - const char* utf8_decode_code_point(const char* first, const char* last, char32_t& out) noexcept + std::pair utf8_decode_code_point(const char* first, + const char* last, + char32_t& out) noexcept { out = end_of_file; if (first == last) { - return first; + return {last, utf8_errc::NoError}; } auto code_unit = *first; auto kind = utf8_code_unit_kind(code_unit); const int count = utf8_code_unit_count(kind); - if (kind == Utf8CodeUnitKind::Invalid || kind == Utf8CodeUnitKind::Continue || count > last - first) + const char* it = first + 1; + + if (kind == Utf8CodeUnitKind::Invalid) { - return first; + return {it, utf8_errc::InvalidCodeUnit}; + } + else if (kind == Utf8CodeUnitKind::Continue) + { + return {it, utf8_errc::UnexpectedContinue}; + } + else if (count > last - first) + { + return {last, utf8_errc::UnexpectedEof}; } - const char* it = first + 1; if (count == 1) { out = static_cast(code_unit); - return it; + return {it, utf8_errc::NoError}; } // 2 -> 0b0001'1111, 6 @@ -131,18 +142,23 @@ namespace vcpkg::Unicode kind = utf8_code_unit_kind(code_unit); if (kind == Utf8CodeUnitKind::Invalid) { - return it; + return {it, utf8_errc::InvalidCodeUnit}; } else if (kind != Utf8CodeUnitKind::Continue) { - return it; + return {it, utf8_errc::UnexpectedStart}; } const int shift = 6 * (count - byte - 1); code_point |= (code_unit & continue_mask) << shift; } - out = code_point; - return it; + + if (code_point > 0x10'FFFF) + { + return {it, utf8_errc::InvalidCodePoint}; + } + + return {it, utf8_errc::NoError}; } bool utf8_is_valid_string(const char* first, const char* last) noexcept @@ -228,31 +244,10 @@ namespace vcpkg::Unicode char32_t code_point; auto new_next = utf8_decode_code_point(next_, last_, code_point); - if (code_point == end_of_file) - { - const char* old_next = next_; - *this = sentinel(); - if (new_next == last_) - { - return utf8_errc::NoError; - } - if (new_next == old_next) - { - auto kind = utf8_code_unit_kind(*new_next); - if (kind == Utf8CodeUnitKind::Invalid) return utf8_errc::InvalidCodeUnit; - if (kind == Utf8CodeUnitKind::Continue) return utf8_errc::UnexpectedContinue; - return utf8_errc::UnexpectedEof; - } - - auto kind = utf8_code_unit_kind(*new_next); - if (kind == Utf8CodeUnitKind::Invalid) return utf8_errc::InvalidCodeUnit; - return utf8_errc::UnexpectedStart; - } - - if (code_point > 0x10'FFFF) + if (new_next.second != utf8_errc::NoError) { *this = sentinel(); - return utf8_errc::InvalidCodePoint; + return new_next.second; } if (utf16_is_trailing_surrogate_code_point(code_point) && utf16_is_leading_surrogate_code_point(current_)) @@ -261,7 +256,7 @@ namespace vcpkg::Unicode return utf8_errc::PairedSurrogates; } - next_ = new_next; + next_ = new_next.first; current_ = code_point; return utf8_errc::NoError; } diff --git a/src/vcpkg/sourceparagraph.cpp b/src/vcpkg/sourceparagraph.cpp index 525f58efbf..39320449f4 100644 --- a/src/vcpkg/sourceparagraph.cpp +++ b/src/vcpkg/sourceparagraph.cpp @@ -866,9 +866,11 @@ namespace vcpkg { char32_t ch; auto char_last = Unicode::utf8_decode_code_point(start, sv.end(), ch); + // unicode errors should be impossible, since the utf8 is correct by JSON construction + Checks::check_exit(VCPKG_LINE_INFO, char_last.second == Unicode::utf8_errc::NoError); return msg::format(msgLicenseExpressionContainsUnicode, msg::value = ch, - msg::pretty_value = StringView{start, char_last}); + msg::pretty_value = StringView{start, char_last.first}); } return msg::format(msgLicenseExpressionContainsInvalidCharacter, msg::value = *start);