From 6aeff5290f03b75ca16b59452188e2ff453fb85f Mon Sep 17 00:00:00 2001
From: nicole mazzuca <mazzucan@outlook.com>
Date: Wed, 26 Jan 2022 12:22:34 -0800
Subject: [PATCH] unicode: make utf8_decode_code_point return {next, utf8_errc}

---
 include/vcpkg/base/unicode.h  | 29 +++++++++-------
 src/vcpkg/base/unicode.cpp    | 63 ++++++++++++++++-------------------
 src/vcpkg/sourceparagraph.cpp |  4 ++-
 3 files changed, 49 insertions(+), 47 deletions(-)
diff --git a/include/vcpkg/base/unicode.h b/include/vcpkg/base/unicode.h
index 076bf63669..2265e9699b 100644
--- a/include/vcpkg/base/unicode.h
+++ b/include/vcpkg/base/unicode.h
@@ -14,13 +14,29 @@ namespace vcpkg::Unicode
         StartFour = 4,
     };
 
+    enum class utf8_errc
+    {
+        NoError = 0,
+        InvalidCodeUnit = 1,    // a byte that utf8_code_unit_kind classifies as Invalid
+        InvalidCodePoint = 2,   // the decoded value is above 0x10'FFFF
+        PairedSurrogates = 3,   // a trailing surrogate directly follows a leading surrogate
+        UnexpectedContinue = 4, // a continuation byte where a start byte was expected
+        UnexpectedStart = 5,    // a non-continuation byte inside a multi-byte sequence
+        UnexpectedEof = 6,      // the input ends before the sequence is complete
+    };
+
     Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;
     int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept;
     int utf8_code_unit_count(char code_unit) noexcept;
 
     int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;
 
-    const char* utf8_decode_code_point(const char* first, const char* last, char32_t& out) noexcept;
+    // returns {after-current-code-point, error}.
+    // if error = NoError and first != last, then out = parsed code point;
+    // else (an error occurred, or first == last), out = end_of_file.
+    std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
+                                                             const char* last,
+                                                             char32_t& out) noexcept;
 
     inline std::string& utf8_append_code_point(std::string& str, char32_t code_point)
     {
@@ -56,17 +72,6 @@ namespace vcpkg::Unicode
 
     constexpr static char32_t end_of_file = 0xFFFF'FFFF;
 
-    enum class utf8_errc
-    {
-        NoError = 0,
-        InvalidCodeUnit = 1,
-        InvalidCodePoint = 2,
-        PairedSurrogates = 3,
-        UnexpectedContinue = 4,
-        UnexpectedStart = 5,
-        UnexpectedEof = 6,
-    };
-
     const std::error_category& utf8_category() noexcept;
 
     inline std::error_code make_error_code(utf8_errc err) noexcept
diff --git a/src/vcpkg/base/unicode.cpp b/src/vcpkg/base/unicode.cpp
index 1bc8813da6..ee31c03707 100644
--- a/src/vcpkg/base/unicode.cpp
+++ b/src/vcpkg/base/unicode.cpp
@@ -92,28 +92,39 @@ namespace vcpkg::Unicode
         return count;
     }
 
-    const char* utf8_decode_code_point(const char* first, const char* last, char32_t& out) noexcept
+    std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
+                                                             const char* last,
+                                                             char32_t& out) noexcept
     {
         out = end_of_file;
         if (first == last)
         {
-            return first;
+            return {last, utf8_errc::NoError}; // empty range: not an error, out stays end_of_file
         }
 
         auto code_unit = *first;
         auto kind = utf8_code_unit_kind(code_unit);
         const int count = utf8_code_unit_count(kind);
 
-        if (kind == Utf8CodeUnitKind::Invalid || kind == Utf8CodeUnitKind::Continue || count > last - first)
+        const char* it = first + 1; // one past the lead byte; returned for single-byte results and bad lead bytes
+
+        if (kind == Utf8CodeUnitKind::Invalid)
         {
-            return first;
+            return {it, utf8_errc::InvalidCodeUnit};
+        }
+        else if (kind == Utf8CodeUnitKind::Continue)
+        {
+            return {it, utf8_errc::UnexpectedContinue};
+        }
+        else if (count > last - first) // fewer bytes remain than the lead byte calls for
+        {
+            return {last, utf8_errc::UnexpectedEof}; // report the whole truncated tail as consumed
         }
-        const char* it = first + 1;
 
         if (count == 1)
         {
             out = static_cast<char32_t>(code_unit);
-            return it;
+            return {it, utf8_errc::NoError};
         }
 
         // 2 -> 0b0001'1111, 6
@@ -131,18 +142,23 @@ namespace vcpkg::Unicode
             kind = utf8_code_unit_kind(code_unit);
             if (kind == Utf8CodeUnitKind::Invalid)
             {
-                return it;
+                return {it, utf8_errc::InvalidCodeUnit};
             }
             else if (kind != Utf8CodeUnitKind::Continue)
             {
-                return it;
+                return {it, utf8_errc::UnexpectedStart};
             }
 
             const int shift = 6 * (count - byte - 1);
             code_point |= (code_unit & continue_mask) << shift;
         }
-        out = code_point;
-        return it;
+        if (code_point > 0x10'FFFF)
+        {
+            return {it, utf8_errc::InvalidCodePoint}; // beyond the Unicode range; out stays end_of_file
+        }
+
+        out = code_point; // publish the decoded value only once it has passed validation
+        return {it, utf8_errc::NoError};
     }
 
     bool utf8_is_valid_string(const char* first, const char* last) noexcept
@@ -228,31 +244,10 @@ namespace vcpkg::Unicode
 
         char32_t code_point;
         auto new_next = utf8_decode_code_point(next_, last_, code_point);
-        if (code_point == end_of_file)
-        {
-            const char* old_next = next_;
-            *this = sentinel();
-            if (new_next == last_)
-            {
-                return utf8_errc::NoError;
-            }
-            if (new_next == old_next)
-            {
-                auto kind = utf8_code_unit_kind(*new_next);
-                if (kind == Utf8CodeUnitKind::Invalid) return utf8_errc::InvalidCodeUnit;
-                if (kind == Utf8CodeUnitKind::Continue) return utf8_errc::UnexpectedContinue;
-                return utf8_errc::UnexpectedEof;
-            }
-
-            auto kind = utf8_code_unit_kind(*new_next);
-            if (kind == Utf8CodeUnitKind::Invalid) return utf8_errc::InvalidCodeUnit;
-            return utf8_errc::UnexpectedStart;
-        }
-
-        if (code_point > 0x10'FFFF)
+        if (new_next.second != utf8_errc::NoError)
         {
             *this = sentinel();
-            return utf8_errc::InvalidCodePoint;
+            return new_next.second;
         }
 
         if (utf16_is_trailing_surrogate_code_point(code_point) && utf16_is_leading_surrogate_code_point(current_))
@@ -261,7 +256,7 @@ namespace vcpkg::Unicode
             return utf8_errc::PairedSurrogates;
         }
 
-        next_ = new_next;
+        next_ = new_next.first;
         current_ = code_point;
         return utf8_errc::NoError;
     }
diff --git a/src/vcpkg/sourceparagraph.cpp b/src/vcpkg/sourceparagraph.cpp
index 525f58efbf..39320449f4 100644
--- a/src/vcpkg/sourceparagraph.cpp
+++ b/src/vcpkg/sourceparagraph.cpp
@@ -866,9 +866,11 @@ namespace vcpkg
             {
                 char32_t ch;
                 auto char_last = Unicode::utf8_decode_code_point(start, sv.end(), ch);
+                // unicode errors should be impossible, since the utf8 is correct by JSON construction
+                Checks::check_exit(VCPKG_LINE_INFO, char_last.second == Unicode::utf8_errc::NoError);
                 return msg::format(msgLicenseExpressionContainsUnicode,
                                    msg::value = ch,
-                                   msg::pretty_value = StringView{start, char_last});
+                                   msg::pretty_value = StringView{start, char_last.first});
             }
 
             return msg::format(msgLicenseExpressionContainsInvalidCharacter, msg::value = *start);