From 52ed89ae8c264d8885bfda4f79033289db459c02 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Thu, 28 May 2020 10:31:38 +0200
Subject: [PATCH 1/5] from_u32_unchecked: check validity when debug assertions
 are enabled

---
 src/libcore/char/convert.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/libcore/char/convert.rs b/src/libcore/char/convert.rs
index 315020bac5850..87c56c4b0a105 100644
--- a/src/libcore/char/convert.rs
+++ b/src/libcore/char/convert.rs
@@ -99,7 +99,7 @@ pub fn from_u32(i: u32) -> Option<char> {
 #[inline]
 #[stable(feature = "char_from_unchecked", since = "1.5.0")]
 pub unsafe fn from_u32_unchecked(i: u32) -> char {
-    transmute(i)
+    if cfg!(debug_assertions) { char::from_u32(i).unwrap() } else { transmute(i) }
 }
 
 #[stable(feature = "char_convert", since = "1.13.0")]
@@ -218,7 +218,7 @@ impl TryFrom<u32> for char {
             Err(CharTryFromError(()))
         } else {
             // SAFETY: checked that it's a legal unicode value
-            Ok(unsafe { from_u32_unchecked(i) })
+            Ok(unsafe { transmute(i) })
         }
     }
 }

From 72d85db6eea29004a467842923c07169bc304217 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 30 May 2020 11:49:31 +0200
Subject: [PATCH 2/5] expose char::encode_utf8_raw for libstd

---
 src/libcore/char/methods.rs | 99 ++++++++++++++++++++++---------------
 src/libcore/char/mod.rs     |  4 ++
 2 files changed, 63 insertions(+), 40 deletions(-)

diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs
index 5c5bc9adb5df2..112e7e38e414b 100644
--- a/src/libcore/char/methods.rs
+++ b/src/libcore/char/methods.rs
@@ -593,16 +593,7 @@ impl char {
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
     pub fn len_utf8(self) -> usize {
-        let code = self as u32;
-        if code < MAX_ONE_B {
-            1
-        } else if code < MAX_TWO_B {
-            2
-        } else if code < MAX_THREE_B {
-            3
-        } else {
-            4
-        }
+        len_utf8(self as u32)
     }
 
     /// Returns the number of 16-bit code units this `char` would need if
@@ -670,36 +661,7 @@ impl char {
     #[stable(feature = "unicode_encode_char", since = "1.15.0")]
     #[inline]
     pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
-        let code = self as u32;
-        let len = self.len_utf8();
-        match (len, &mut dst[..]) {
-            (1, [a, ..]) => {
-                *a = code as u8;
-            }
-            (2, [a, b, ..]) => {
-                *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
-                *b = (code & 0x3F) as u8 | TAG_CONT;
-            }
-            (3, [a, b, c, ..]) => {
-                *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
-                *b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
-                *c = (code & 0x3F) as u8 | TAG_CONT;
-            }
-            (4, [a, b, c, d, ..]) => {
-                *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
-                *b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
-                *c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
-                *d = (code & 0x3F) as u8 | TAG_CONT;
-            }
-            _ => panic!(
-                "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
-                len,
-                code,
-                dst.len(),
-            ),
-        };
-        // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
-        unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
+        encode_utf8_raw(self as u32, dst)
     }
 
     /// Encodes this character as UTF-16 into the provided `u16` buffer,
@@ -1673,3 +1635,60 @@ impl char {
         }
     }
 }
+
+#[inline]
+fn len_utf8(code: u32) -> usize {
+    if code < MAX_ONE_B {
+        1
+    } else if code < MAX_TWO_B {
+        2
+    } else if code < MAX_THREE_B {
+        3
+    } else {
+        4
+    }
+}
+
+/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
+/// and then returns the subslice of the buffer that contains the encoded character.
+///
+/// Unlike `char::encode_utf8`, this method can be called on codepoints in the surrogate range.
+///
+/// # Panics
+///
+/// Panics if the buffer is not large enough.
+/// A buffer of length four is large enough to encode any `char`.
+#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+#[doc(hidden)]
+#[inline]
+pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
+    let len = len_utf8(code);
+    match (len, &mut dst[..]) {
+        (1, [a, ..]) => {
+            *a = code as u8;
+        }
+        (2, [a, b, ..]) => {
+            *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
+            *b = (code & 0x3F) as u8 | TAG_CONT;
+        }
+        (3, [a, b, c, ..]) => {
+            *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
+            *b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
+            *c = (code & 0x3F) as u8 | TAG_CONT;
+        }
+        (4, [a, b, c, d, ..]) => {
+            *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
+            *b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
+            *c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
+            *d = (code & 0x3F) as u8 | TAG_CONT;
+        }
+        _ => panic!(
+            "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
+            len,
+            code,
+            dst.len(),
+        ),
+    };
+    // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
+    unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
+}
diff --git a/src/libcore/char/mod.rs b/src/libcore/char/mod.rs
index bf65c31e13597..40b429b749668 100644
--- a/src/libcore/char/mod.rs
+++ b/src/libcore/char/mod.rs
@@ -37,6 +37,10 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
 #[stable(feature = "unicode_version", since = "1.45.0")]
 pub use crate::unicode::UNICODE_VERSION;
 
+// perma-unstable re-exports
+#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+pub use self::methods::encode_utf8_raw;
+
 use crate::fmt::{self, Write};
 use crate::iter::FusedIterator;
 

From 3182cdf9baf8ed9e8ae24f4742ee5d3d01c2b54a Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 30 May 2020 11:53:50 +0200
Subject: [PATCH 3/5] wtf8: use encode_utf8_raw

---
 src/libstd/lib.rs             | 1 +
 src/libstd/sys_common/wtf8.rs | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libstd/lib.rs b/src/libstd/lib.rs
index cc3e613fa3d60..3e3c1fd9026bc 100644
--- a/src/libstd/lib.rs
+++ b/src/libstd/lib.rs
@@ -247,6 +247,7 @@
 #![feature(cfg_target_has_atomic)]
 #![feature(cfg_target_thread_local)]
 #![feature(char_error_internals)]
+#![feature(char_internals)]
 #![feature(clamp)]
 #![feature(concat_idents)]
 #![feature(const_cstr_unchecked)]
diff --git a/src/libstd/sys_common/wtf8.rs b/src/libstd/sys_common/wtf8.rs
index a98407da44850..90bbf4afd1a32 100644
--- a/src/libstd/sys_common/wtf8.rs
+++ b/src/libstd/sys_common/wtf8.rs
@@ -201,9 +201,8 @@ impl Wtf8Buf {
     /// Copied from String::push
     /// This does **not** include the WTF-8 concatenation check.
     fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
-        let c = unsafe { char::from_u32_unchecked(code_point.value) };
         let mut bytes = [0; 4];
-        let bytes = c.encode_utf8(&mut bytes).as_bytes();
+        let bytes = char::encode_utf8_raw(code_point.value, &mut bytes).as_bytes();
         self.bytes.extend_from_slice(bytes)
     }
 

From 9c627c33dde998cfe42bcde07e1c5692370daf63 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 30 May 2020 12:08:55 +0200
Subject: [PATCH 4/5] also expose and use encode_utf16_raw for wtf8

---
 src/libcore/char/methods.rs   | 59 ++++++++++++++++++++++-------------
 src/libcore/char/mod.rs       |  2 ++
 src/libstd/sys_common/wtf8.rs |  3 +-
 3 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs
index 112e7e38e414b..b1b3c70efb1c7 100644
--- a/src/libcore/char/methods.rs
+++ b/src/libcore/char/methods.rs
@@ -701,28 +701,7 @@ impl char {
     #[stable(feature = "unicode_encode_char", since = "1.15.0")]
     #[inline]
     pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
-        let mut code = self as u32;
-        // SAFETY: each arm checks whether there are enough bits to write into
-        unsafe {
-            if (code & 0xFFFF) == code && !dst.is_empty() {
-                // The BMP falls through (assuming non-surrogate, as it should)
-                *dst.get_unchecked_mut(0) = code as u16;
-                slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
-            } else if dst.len() >= 2 {
-                // Supplementary planes break into surrogates.
-                code -= 0x1_0000;
-                *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
-                *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
-                slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
-            } else {
-                panic!(
-                    "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
-                    from_u32_unchecked(code).len_utf16(),
-                    code,
-                    dst.len(),
-                )
-            }
-        }
+        encode_utf16_raw(self as u32, dst)
     }
 
     /// Returns `true` if this `char` has the `Alphabetic` property.
@@ -1692,3 +1671,39 @@ pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
     // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
     unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
 }
+
+/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
+/// and then returns the subslice of the buffer that contains the encoded character.
+///
+/// Unlike `char::encode_utf16`, this method can be called on codepoints in the surrogate range.
+///
+/// # Panics
+///
+/// Panics if the buffer is not large enough.
+/// A buffer of length 2 is large enough to encode any `char`.
+#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+#[doc(hidden)]
+#[inline]
+pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
+    // SAFETY: each arm checks whether there are enough bits to write into
+    unsafe {
+        if (code & 0xFFFF) == code && !dst.is_empty() {
+            // The BMP falls through (assuming non-surrogate, as it should)
+            *dst.get_unchecked_mut(0) = code as u16;
+            slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
+        } else if dst.len() >= 2 {
+            // Supplementary planes break into surrogates.
+            code -= 0x1_0000;
+            *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
+            *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
+            slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
+        } else {
+            panic!(
+                "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
+                if (code & 0xFFFF) == code { 1 } else { 2 }, // `code` may be a surrogate
+                code,
+                dst.len(),
+            )
+        }
+    }
+}
diff --git a/src/libcore/char/mod.rs b/src/libcore/char/mod.rs
index 40b429b749668..1b4e906e4e475 100644
--- a/src/libcore/char/mod.rs
+++ b/src/libcore/char/mod.rs
@@ -39,6 +39,8 @@ pub use crate::unicode::UNICODE_VERSION;
 
 // perma-unstable re-exports
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+pub use self::methods::encode_utf16_raw;
+#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 pub use self::methods::encode_utf8_raw;
 
 use crate::fmt::{self, Write};
diff --git a/src/libstd/sys_common/wtf8.rs b/src/libstd/sys_common/wtf8.rs
index 90bbf4afd1a32..9f589c93ae59c 100644
--- a/src/libstd/sys_common/wtf8.rs
+++ b/src/libstd/sys_common/wtf8.rs
@@ -828,8 +828,7 @@ impl<'a> Iterator for EncodeWide<'a> {
 
         let mut buf = [0; 2];
         self.code_points.next().map(|code_point| {
-            let c = unsafe { char::from_u32_unchecked(code_point.value) };
-            let n = c.encode_utf16(&mut buf).len();
+            let n = char::encode_utf16_raw(code_point.value, &mut buf).len();
             if n == 2 {
                 self.extra = buf[1];
             }

From 0fb6e63c0438ace4ad9d496376af955c0baacf04 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 30 May 2020 17:13:07 +0200
Subject: [PATCH 5/5] encode_utf8_raw is not always valid UTF-8; clarify
 comments

---
 src/libcore/char/methods.rs   | 19 ++++++++++++-------
 src/libstd/sys_common/wtf8.rs |  2 +-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs
index b1b3c70efb1c7..bf09b28ff693e 100644
--- a/src/libcore/char/methods.rs
+++ b/src/libcore/char/methods.rs
@@ -661,7 +661,8 @@ impl char {
     #[stable(feature = "unicode_encode_char", since = "1.15.0")]
     #[inline]
     pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
-        encode_utf8_raw(self as u32, dst)
+        // SAFETY: `char` is not a surrogate, so this is valid UTF-8.
+        unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) }
     }
 
     /// Encodes this character as UTF-16 into the provided `u16` buffer,
@@ -1631,7 +1632,11 @@ fn len_utf8(code: u32) -> usize {
 /// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
 /// and then returns the subslice of the buffer that contains the encoded character.
 ///
-/// Unlike `char::encode_utf8`, this method can be called on codepoints in the surrogate range.
+/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
+/// (Creating a `char` in the surrogate range is UB.)
+/// For surrogates, the result is valid [generalized UTF-8] but not valid UTF-8.
+///
+/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
 ///
 /// # Panics
 ///
@@ -1640,7 +1645,7 @@ fn len_utf8(code: u32) -> usize {
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 #[doc(hidden)]
 #[inline]
-pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
+pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
     let len = len_utf8(code);
     match (len, &mut dst[..]) {
         (1, [a, ..]) => {
@@ -1668,14 +1673,14 @@ pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
             dst.len(),
         ),
     };
-    // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
-    unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
+    &mut dst[..len]
 }
 
 /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
 /// and then returns the subslice of the buffer that contains the encoded character.
 ///
-/// Unlike `char::encode_utf16`, this method can be called on codepoints in the surrogate range.
+/// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
+/// (Creating a `char` in the surrogate range is UB.)
 ///
 /// # Panics
 ///
@@ -1688,7 +1693,7 @@ pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
     // SAFETY: each arm checks whether there are enough bits to write into
     unsafe {
         if (code & 0xFFFF) == code && !dst.is_empty() {
-            // The BMP falls through (assuming non-surrogate, as it should)
+            // The BMP falls through
             *dst.get_unchecked_mut(0) = code as u16;
             slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
         } else if dst.len() >= 2 {
diff --git a/src/libstd/sys_common/wtf8.rs b/src/libstd/sys_common/wtf8.rs
index 9f589c93ae59c..ccb54b7e68d18 100644
--- a/src/libstd/sys_common/wtf8.rs
+++ b/src/libstd/sys_common/wtf8.rs
@@ -202,7 +202,7 @@ impl Wtf8Buf {
     /// This does **not** include the WTF-8 concatenation check.
     fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
         let mut bytes = [0; 4];
-        let bytes = char::encode_utf8_raw(code_point.value, &mut bytes).as_bytes();
+        let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
         self.bytes.extend_from_slice(bytes)
     }