diff --git a/src/read.rs b/src/read.rs index e9f3f58e7..6b5abdc22 100644 --- a/src/read.rs +++ b/src/read.rs @@ -898,67 +898,66 @@ fn parse_unicode_escape<'de, R: Read<'de>>( validate: bool, scratch: &mut Vec, ) -> Result<()> { - let c = match tri!(read.decode_hex_escape()) { - n @ 0xDC00..=0xDFFF => { - return if validate { - error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) - } else { - push_wtf8_codepoint(n as u32, scratch); - Ok(()) - }; - } + let n = tri!(read.decode_hex_escape()); - // Non-BMP characters are encoded as a sequence of two hex - // escapes, representing UTF-16 surrogates. If deserializing a - // utf-8 string the surrogates are required to be paired, - // whereas deserializing a byte string accepts lone surrogates. - n1 @ 0xD800..=0xDBFF => { - if tri!(peek_or_eof(read)) == b'\\' { - read.discard(); - } else { - return if validate { - read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) - } else { - push_wtf8_codepoint(n1 as u32, scratch); - Ok(()) - }; - } + // Non-BMP characters are encoded as a sequence of two hex + // escapes, representing UTF-16 surrogates. If deserializing a + // utf-8 string the surrogates are required to be paired, + // whereas deserializing a byte string accepts lone surrogates. + if validate && n >= 0xDC00 && n <= 0xDFFF { + // XXX: This is actually a trailing surrogate. + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } - if tri!(peek_or_eof(read)) == b'u' { - read.discard(); - } else { - return if validate { - read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) - } else { - push_wtf8_codepoint(n1 as u32, scratch); - // The \ prior to this byte started an escape sequence, - // so we need to parse that now. This recursive call - // does not blow the stack on malicious input because - // the escape is not \u, so it will be handled by one - // of the easy nonrecursive cases. - parse_escape(read, validate, scratch) - }; - } + if n < 0xD800 || n > 0xDBFF { + // Every u16 outside of the surrogate ranges is guaranteed to be a + // legal char. + push_wtf8_codepoint(n as u32, scratch); + return Ok(()); + } - let n2 = tri!(read.decode_hex_escape()); + // n is a leading surrogate, we now expect a trailing surrogate. + let n1 = n; - if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } + if tri!(peek_or_eof(read)) == b'\\' { + read.discard(); + } else { + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + push_wtf8_codepoint(n1 as u32, scratch); + Ok(()) + }; + } - // This value is in range U+10000..=U+10FFFF, which is always a - // valid codepoint. - (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000 - } + if tri!(peek_or_eof(read)) == b'u' { + read.discard(); + } else { + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + push_wtf8_codepoint(n1 as u32, scratch); + // The \ prior to this byte started an escape sequence, + // so we need to parse that now. This recursive call + // does not blow the stack on malicious input because + // the escape is not \u, so it will be handled by one + // of the easy nonrecursive cases. + parse_escape(read, validate, scratch) + }; + } - // Every u16 outside of the surrogate ranges above is guaranteed - // to be a legal char. - n => n as u32, - }; + let n2 = tri!(read.decode_hex_escape()); + + if n2 < 0xDC00 || n2 > 0xDFFF { + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } - push_wtf8_codepoint(c, scratch); + // This value is in range U+10000..=U+10FFFF, which is always a + // valid codepoint. + let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; + push_wtf8_codepoint(n, scratch); Ok(()) }