From 677e6f3439b91769a9b54c18afe5c136c14d9e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pol=20Valletb=C3=B3?= Date: Wed, 11 Oct 2023 13:09:12 +0200 Subject: [PATCH 1/5] fix: use unescape_byte function for Byte literals --- crates/parser/src/lexed_str.rs | 2 +- .../lexer/err/byte_char_literals.rast | 28 ++++++++----------- .../test_data/lexer/err/byte_char_literals.rs | 3 -- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs index 8e8bdce1eef26..84cedc1fa3f0f 100644 --- a/crates/parser/src/lexed_str.rs +++ b/crates/parser/src/lexed_str.rs @@ -274,7 +274,7 @@ impl<'a> Converter<'a> { let text = &self.res.text[self.offset + 2..][..len - 2]; let i = text.rfind('\'').unwrap(); let text = &text[..i]; - if let Err(e) = rustc_lexer::unescape::unescape_char(text) { + if let Err(e) = rustc_lexer::unescape::unescape_byte(text) { err = error_to_diagnostic_message(e, Mode::Byte); } } diff --git a/crates/parser/test_data/lexer/err/byte_char_literals.rast b/crates/parser/test_data/lexer/err/byte_char_literals.rast index 24892bc239486..7603c9099daad 100644 --- a/crates/parser/test_data/lexer/err/byte_char_literals.rast +++ b/crates/parser/test_data/lexer/err/byte_char_literals.rast @@ -22,9 +22,9 @@ BYTE "b'\\'a'" error: character literal may only contain one codepoint WHITESPACE "\n" BYTE "b'\\0a'" error: character literal may only contain one codepoint WHITESPACE "\n" -BYTE "b'\\u{0}x'" error: character literal may only contain one codepoint +BYTE "b'\\u{0}x'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{1F63b}}'" error: character literal may only contain one codepoint +BYTE "b'\\u{1F63b}}'" error: unicode escape in byte string WHITESPACE "\n" BYTE "b'\\v'" error: unknown byte escape WHITESPACE "\n" @@ -50,12 +50,6 @@ BYTE "b'\\x🦀'" error: invalid character in numeric character escape WHITESPACE "\n" BYTE "b'\\xtt'" error: invalid character in numeric character escape WHITESPACE "\n" -BYTE "b'\\xff'" error: out of range hex escape -WHITESPACE "\n" -BYTE "b'\\xFF'" error: out of range hex escape -WHITESPACE "\n" -BYTE "b'\\x80'" error: out of range hex escape -WHITESPACE "\n" BYTE "b'\\u'" error: incorrect unicode escape sequence WHITESPACE "\n" BYTE "b'\\u[0123]'" error: incorrect unicode escape sequence @@ -72,21 +66,21 @@ BYTE "b'\\u{_0000}'" error: invalid start of unicode escape WHITESPACE "\n" BYTE "b'\\u{0000000}'" error: overlong unicode escape WHITESPACE "\n" -BYTE "b'\\u{FFFFFF}'" error: invalid unicode character escape +BYTE "b'\\u{FFFFFF}'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{ffffff}'" error: invalid unicode character escape +BYTE "b'\\u{ffffff}'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{ffffff}'" error: invalid unicode character escape +BYTE "b'\\u{ffffff}'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{DC00}'" error: invalid unicode character escape +BYTE "b'\\u{DC00}'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{DDDD}'" error: invalid unicode character escape +BYTE "b'\\u{DDDD}'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{DFFF}'" error: invalid unicode character escape +BYTE "b'\\u{DFFF}'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{D800}'" error: invalid unicode character escape +BYTE "b'\\u{D800}'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{DAAA}'" error: invalid unicode character escape +BYTE "b'\\u{DAAA}'" error: unicode escape in byte string WHITESPACE "\n" -BYTE "b'\\u{DBFF}'" error: invalid unicode character escape +BYTE "b'\\u{DBFF}'" error: unicode escape in byte string WHITESPACE "\n" diff --git a/crates/parser/test_data/lexer/err/byte_char_literals.rs b/crates/parser/test_data/lexer/err/byte_char_literals.rs index 9f2f4309e7692..b2d06e490bd6f 100644 --- a/crates/parser/test_data/lexer/err/byte_char_literals.rs +++ b/crates/parser/test_data/lexer/err/byte_char_literals.rs @@ -25,9 +25,6 @@ b'\xx' b'\xы' b'\x🦀' b'\xtt' -b'\xff' -b'\xFF' -b'\x80' b'\u' b'\u[0123]' b'\u{0x}' From e1aeb7fa794e228ca9099ac7679e8a1d0b22238a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pol=20Valletb=C3=B3?= Date: Wed, 11 Oct 2023 15:25:52 +0200 Subject: [PATCH 2/5] fix: handle errors for string byte string and c_string --- crates/parser/src/lexed_str.rs | 42 ++++++++++++++++++- .../test_data/lexer/err/byte_strings.rast | 28 +++++++++++++ .../test_data/lexer/err/byte_strings.rs | 14 +++++++ .../parser/test_data/lexer/err/c_strings.rast | 28 +++++++++++++ .../parser/test_data/lexer/err/c_strings.rs | 14 +++++++ .../parser/test_data/lexer/err/strings.rast | 28 +++++++++++++ crates/parser/test_data/lexer/err/strings.rs | 14 +++++++ 7 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 crates/parser/test_data/lexer/err/byte_strings.rast create mode 100644 crates/parser/test_data/lexer/err/byte_strings.rs create mode 100644 crates/parser/test_data/lexer/err/c_strings.rast create mode 100644 crates/parser/test_data/lexer/err/c_strings.rs create mode 100644 crates/parser/test_data/lexer/err/strings.rast create mode 100644 crates/parser/test_data/lexer/err/strings.rs diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs index 84cedc1fa3f0f..c2e25daf37f6e 100644 --- a/crates/parser/src/lexed_str.rs +++ b/crates/parser/src/lexed_str.rs @@ -8,7 +8,10 @@ //! Note that these tokens, unlike the tokens we feed into the parser, do //! include info about comments and whitespace. -use rustc_dependencies::lexer as rustc_lexer; +use rustc_dependencies::lexer::{ + self as rustc_lexer, + unescape::{unescape_c_string, unescape_literal}, +}; use std::ops; @@ -284,18 +287,45 @@ impl<'a> Converter<'a> { rustc_lexer::LiteralKind::Str { terminated } => { if !terminated { err = "Missing trailing `\"` symbol to terminate the string literal"; + } else { + let text = &self.res.text[self.offset + 1..][..len - 1]; + let i = text.rfind('"').unwrap(); + let text = &text[..i]; + rustc_lexer::unescape::unescape_literal(text, Mode::Str, &mut |_, res| { + if let Err(e) = res { + err = error_to_diagnostic_message(e, Mode::Str); + } + }); } STRING } rustc_lexer::LiteralKind::ByteStr { terminated } => { if !terminated { err = "Missing trailing `\"` symbol to terminate the byte string literal"; + } else { + let text = &self.res.text[self.offset + 2..][..len - 2]; + let i = text.rfind('"').unwrap(); + let text = &text[..i]; + rustc_lexer::unescape::unescape_literal(text, Mode::ByteStr, &mut |_, res| { + if let Err(e) = res { + err = error_to_diagnostic_message(e, Mode::ByteStr); + } + }) } BYTE_STRING } rustc_lexer::LiteralKind::CStr { terminated } => { if !terminated { err = "Missing trailing `\"` symbol to terminate the string literal"; + } else { + let text = &self.res.text[self.offset + 2..][..len - 2]; + let i = text.rfind('"').unwrap(); + let text = &text[..i]; + rustc_lexer::unescape::unescape_c_string(text, Mode::CStr, &mut |_, res| { + if let Err(e) = res { + err = error_to_diagnostic_message(e, Mode::CStr); + } + }) } C_STRING } @@ -360,3 +390,13 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { EscapeError::MultipleSkippedLinesWarning => "", } } + +fn fill_unescape_string_error(text: &str, mode: Mode, mut error_message: &str) { + + rustc_lexer::unescape::unescape_c_string(text, mode, &mut |_, res| { + if let Err(e) = res { + error_message = error_to_diagnostic_message(e, mode); + } + }); +} + diff --git a/crates/parser/test_data/lexer/err/byte_strings.rast b/crates/parser/test_data/lexer/err/byte_strings.rast new file mode 100644 index 0000000000000..e8d8ff8cefb42 --- /dev/null +++ b/crates/parser/test_data/lexer/err/byte_strings.rast @@ -0,0 +1,28 @@ +BYTE_STRING "b\"\\💩\"" error: unknown byte escape +WHITESPACE "\n" +BYTE_STRING "b\"\\●\"" error: unknown byte escape +WHITESPACE "\n" +BYTE_STRING "b\"\\u{_0000}\"" error: invalid start of unicode escape +WHITESPACE "\n" +BYTE_STRING "b\"\\u{0000000}\"" error: overlong unicode escape +WHITESPACE "\n" +BYTE_STRING "b\"\\u{FFFFFF}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\u{ffffff}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\u{ffffff}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\u{DC00}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\u{DDDD}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\u{DFFF}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\u{D800}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\u{DAAA}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\u{DBFF}\"" error: unicode escape in byte string +WHITESPACE "\n" +BYTE_STRING "b\"\\xы\"" error: invalid character in numeric character escape +WHITESPACE "\n" diff --git a/crates/parser/test_data/lexer/err/byte_strings.rs b/crates/parser/test_data/lexer/err/byte_strings.rs new file mode 100644 index 0000000000000..e74847137b1ea --- /dev/null +++ b/crates/parser/test_data/lexer/err/byte_strings.rs @@ -0,0 +1,14 @@ +b"\💩" +b"\●" +b"\u{_0000}" +b"\u{0000000}" +b"\u{FFFFFF}" +b"\u{ffffff}" +b"\u{ffffff}" +b"\u{DC00}" +b"\u{DDDD}" +b"\u{DFFF}" +b"\u{D800}" +b"\u{DAAA}" +b"\u{DBFF}" +b"\xы" diff --git a/crates/parser/test_data/lexer/err/c_strings.rast b/crates/parser/test_data/lexer/err/c_strings.rast new file mode 100644 index 0000000000000..1b4424ba5c781 --- /dev/null +++ b/crates/parser/test_data/lexer/err/c_strings.rast @@ -0,0 +1,28 @@ +C_STRING "c\"\\💩\"" error: unknown character escape +WHITESPACE "\n" +C_STRING "c\"\\●\"" error: unknown character escape +WHITESPACE "\n" +C_STRING "c\"\\u{_0000}\"" error: invalid start of unicode escape +WHITESPACE "\n" +C_STRING "c\"\\u{0000000}\"" error: overlong unicode escape +WHITESPACE "\n" +C_STRING "c\"\\u{FFFFFF}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\u{ffffff}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\u{ffffff}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\u{DC00}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\u{DDDD}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\u{DFFF}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\u{D800}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\u{DAAA}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\u{DBFF}\"" error: invalid unicode character escape +WHITESPACE "\n" +C_STRING "c\"\\xы\"" error: invalid character in numeric character escape +WHITESPACE "\n" diff --git a/crates/parser/test_data/lexer/err/c_strings.rs b/crates/parser/test_data/lexer/err/c_strings.rs new file mode 100644 index 0000000000000..1b78ffc28a00d --- /dev/null +++ b/crates/parser/test_data/lexer/err/c_strings.rs @@ -0,0 +1,14 @@ +c"\💩" +c"\●" +c"\u{_0000}" +c"\u{0000000}" +c"\u{FFFFFF}" +c"\u{ffffff}" +c"\u{ffffff}" +c"\u{DC00}" +c"\u{DDDD}" +c"\u{DFFF}" +c"\u{D800}" +c"\u{DAAA}" +c"\u{DBFF}" +c"\xы" diff --git a/crates/parser/test_data/lexer/err/strings.rast b/crates/parser/test_data/lexer/err/strings.rast new file mode 100644 index 0000000000000..0cd1747208e4d --- /dev/null +++ b/crates/parser/test_data/lexer/err/strings.rast @@ -0,0 +1,28 @@ +STRING "\"\\💩\"" error: unknown character escape +WHITESPACE "\n" +STRING "\"\\●\"" error: unknown character escape +WHITESPACE "\n" +STRING "\"\\u{_0000}\"" error: invalid start of unicode escape +WHITESPACE "\n" +STRING "\"\\u{0000000}\"" error: overlong unicode escape +WHITESPACE "\n" +STRING "\"\\u{FFFFFF}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\u{ffffff}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\u{ffffff}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\u{DC00}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\u{DDDD}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\u{DFFF}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\u{D800}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\u{DAAA}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\u{DBFF}\"" error: invalid unicode character escape +WHITESPACE "\n" +STRING "\"\\xы\"" error: invalid character in numeric character escape +WHITESPACE "\n" diff --git a/crates/parser/test_data/lexer/err/strings.rs b/crates/parser/test_data/lexer/err/strings.rs new file mode 100644 index 0000000000000..2499516d3fa9f --- /dev/null +++ b/crates/parser/test_data/lexer/err/strings.rs @@ -0,0 +1,14 @@ +"\💩" +"\●" +"\u{_0000}" +"\u{0000000}" +"\u{FFFFFF}" +"\u{ffffff}" +"\u{ffffff}" +"\u{DC00}" +"\u{DDDD}" +"\u{DFFF}" +"\u{D800}" +"\u{DAAA}" +"\u{DBFF}" +"\xы" From b769f34f6371b13f7ce81cefe65579911331eb16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pol=20Valletb=C3=B3?= Date: Wed, 11 Oct 2023 15:45:45 +0200 Subject: [PATCH 3/5] chore: move common code to function --- crates/parser/src/lexed_str.rs | 46 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs index c2e25daf37f6e..4d322f21ae70a 100644 --- a/crates/parser/src/lexed_str.rs +++ b/crates/parser/src/lexed_str.rs @@ -291,11 +291,7 @@ impl<'a> Converter<'a> { let text = &self.res.text[self.offset + 1..][..len - 1]; let i = text.rfind('"').unwrap(); let text = &text[..i]; - rustc_lexer::unescape::unescape_literal(text, Mode::Str, &mut |_, res| { - if let Err(e) = res { - err = error_to_diagnostic_message(e, Mode::Str); - } - }); + err = unescape_string_error_message(text, Mode::Str); } STRING } @@ -306,11 +302,7 @@ impl<'a> Converter<'a> { let text = &self.res.text[self.offset + 2..][..len - 2]; let i = text.rfind('"').unwrap(); let text = &text[..i]; - rustc_lexer::unescape::unescape_literal(text, Mode::ByteStr, &mut |_, res| { - if let Err(e) = res { - err = error_to_diagnostic_message(e, Mode::ByteStr); - } - }) + err = unescape_string_error_message(text, Mode::ByteStr); } BYTE_STRING } @@ -321,11 +313,7 @@ impl<'a> Converter<'a> { let text = &self.res.text[self.offset + 2..][..len - 2]; let i = text.rfind('"').unwrap(); let text = &text[..i]; - rustc_lexer::unescape::unescape_c_string(text, Mode::CStr, &mut |_, res| { - if let Err(e) = res { - err = error_to_diagnostic_message(e, Mode::CStr); - } - }) + err = unescape_string_error_message(text, Mode::CStr); } C_STRING } @@ -391,12 +379,26 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { } } -fn fill_unescape_string_error(text: &str, mode: Mode, mut error_message: &str) { - - rustc_lexer::unescape::unescape_c_string(text, mode, &mut |_, res| { - if let Err(e) = res { - error_message = error_to_diagnostic_message(e, mode); +fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str { + let mut error_message = ""; + match mode { + Mode::CStr => { + rustc_lexer::unescape::unescape_c_string(text, mode, &mut |_, res| { + if let Err(e) = res { + error_message = error_to_diagnostic_message(e, mode); + } + }); + } + Mode::ByteStr | Mode::Str => { + rustc_lexer::unescape::unescape_literal(text, mode, &mut |_, res| { + if let Err(e) = res { + error_message = error_to_diagnostic_message(e, mode); + } + }); } - }); + _ => { + // Other Modes are not supported yet or do not apply + } + } + error_message } - From 4b281ffdf2c0315722729ec090f74c0d49feca1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pol=20Valletb=C3=B3?= Date: Wed, 11 Oct 2023 15:52:05 +0200 Subject: [PATCH 4/5] chore: format imports --- crates/parser/src/lexed_str.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs index 4d322f21ae70a..13189b8bd003d 100644 --- a/crates/parser/src/lexed_str.rs +++ b/crates/parser/src/lexed_str.rs @@ -7,11 +7,9 @@ //! //! Note that these tokens, unlike the tokens we feed into the parser, do //! include info about comments and whitespace. +// -use rustc_dependencies::lexer::{ - self as rustc_lexer, - unescape::{unescape_c_string, unescape_literal}, -}; +use rustc_dependencies::lexer as rustc_lexer; use std::ops; From 6845c80a2fd52e2d8c58bda0e55c39c4bb836ad1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pol=20Valletb=C3=B3?= Date: Wed, 11 Oct 2023 15:52:22 +0200 Subject: [PATCH 5/5] fix: format --- crates/parser/src/lexed_str.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs index 13189b8bd003d..b9e7566fdf9bc 100644 --- a/crates/parser/src/lexed_str.rs +++ b/crates/parser/src/lexed_str.rs @@ -7,7 +7,6 @@ //! //! Note that these tokens, unlike the tokens we feed into the parser, do //! include info about comments and whitespace. -// use rustc_dependencies::lexer as rustc_lexer;