XXX: delay unescaping

Note that `from_token_lit` was looking for errors but never finding them!

- issue-62913.rs: The structure and output changed a bit. Issue rust-lang#62913 was about an ICE due to an unterminated string literal, so the new version should be good enough.
- literals-are-validated-before-expansion.rs: this tests exactly the behaviour that has been changed. XXX: insert a new test covering more of that.
nnethercote · Dec 7, 2023 · a7e1ee3 · a7e1ee3
1 parent 3469136
commit a7e1ee3
Show file tree

Hide file tree

Showing 16 changed files with 135 additions and 124 deletions.
diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
@@ -3,11 +3,11 @@
 use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
 use crate::token::{self, Token};
 use rustc_lexer::unescape::{
-    byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit,
-    Mode,
+    byte_from_char, unescape_c_string, unescape_literal, CStrUnit, EscapeError, Mode,
 };
 use rustc_span::symbol::{kw, sym, Symbol};
 use rustc_span::Span;
+use std::ops::Range;
 use std::{ascii, fmt, str};
 
 // Escapes a string, represented as a symbol. Reuses the original symbol,
@@ -32,7 +32,9 @@ pub fn escape_byte_str_symbol(bytes: &[u8]) -> Symbol {
 
 #[derive(Debug)]
 pub enum LitError {
-    LexerError,
+    LexerError, // njn: can I get rid of this?
+    // The range is the zero-indexed byte range of the bad character(s) within
+    // the literal's contents, i.e. excluding the prefix and opening quote.
+    EscapeError(Mode, Range<usize>, EscapeError),
     InvalidSuffix,
     InvalidIntSuffix,
     InvalidFloatSuffix,
@@ -54,14 +56,36 @@ impl LitKind {
                 LitKind::Bool(symbol == kw::True)
             }
             token::Byte => {
-                return unescape_byte(symbol.as_str())
-                    .map(LitKind::Byte)
-                    .map_err(|_| LitError::LexerError);
+                let mut res = None;
+                let mut error = Ok(());
+                unescape_literal(symbol.as_str(), Mode::Byte, &mut |range, unescaped_char| {
+                    match unescaped_char {
+                        Ok(c) => res = Some(c),
+                        Err(err) => {
+                            if err.is_fatal() {
+                                error = Err(LitError::EscapeError(Mode::Byte, range, err));
+                            }
+                        }
+                    }
+                });
+                error?;
+                LitKind::Byte(byte_from_char(res.unwrap()))
             }
             token::Char => {
-                return unescape_char(symbol.as_str())
-                    .map(LitKind::Char)
-                    .map_err(|_| LitError::LexerError);
+                let mut res = None;
+                let mut error = Ok(());
+                unescape_literal(symbol.as_str(), Mode::Char, &mut |range, unescaped_char| {
+                    match unescaped_char {
+                        Ok(c) => res = Some(c),
+                        Err(err) => {
+                            if err.is_fatal() {
+                                error = Err(LitError::EscapeError(Mode::Char, range, err));
+                            }
+                        }
+                    }
+                });
+                error?;
+                LitKind::Char(res.unwrap())
             }
 
             // There are some valid suffixes for integer and float literals,
@@ -85,11 +109,12 @@ impl LitKind {
                         s,
                         Mode::Str,
                         &mut #[inline(always)]
-                        |_, unescaped_char| match unescaped_char {
+                        |range, unescaped_char| match unescaped_char {
                             Ok(c) => buf.push(c),
                             Err(err) => {
+                                // njn: what about warnings?
                                 if err.is_fatal() {
-                                    error = Err(LitError::LexerError);
+                                    error = Err(LitError::EscapeError(Mode::Str, range, err));
                                 }
                             }
                         },
@@ -104,36 +129,37 @@ impl LitKind {
             token::StrRaw(n) => {
                 // Ditto.
                 let s = symbol.as_str();
-                let symbol =
-                    if s.contains('\r') {
-                        let mut buf = String::with_capacity(s.len());
-                        let mut error = Ok(());
-                        unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| {
-                            match unescaped_char {
-                                Ok(c) => buf.push(c),
-                                Err(err) => {
-                                    if err.is_fatal() {
-                                        error = Err(LitError::LexerError);
-                                    }
+                let symbol = if s.contains('\r') {
+                    let mut buf = String::with_capacity(s.len());
+                    let mut error = Ok(());
+                    unescape_literal(s, Mode::RawStr, &mut |range, unescaped_char| {
+                        match unescaped_char {
+                            Ok(c) => buf.push(c),
+                            Err(err) => {
+                                if err.is_fatal() {
+                                    error = Err(LitError::EscapeError(Mode::RawStr, range, err));
                                 }
                             }
-                        });
-                        error?;
-                        Symbol::intern(&buf)
-                    } else {
-                        symbol
-                    };
+                        }
+                    });
+                    error?;
+                    Symbol::intern(&buf)
+                } else {
+                    symbol
+                };
                 LitKind::Str(symbol, ast::StrStyle::Raw(n))
             }
             token::ByteStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
                 let mut error = Ok(());
-                unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
+                // njn: precursor: make this conditional? Likewise for
+                //      ByteStrRaw, CStr, CStrRaw?
+                unescape_literal(s, Mode::ByteStr, &mut |range, c| match c {
                     Ok(c) => buf.push(byte_from_char(c)),
                     Err(err) => {
                         if err.is_fatal() {
-                            error = Err(LitError::LexerError);
+                            error = Err(LitError::EscapeError(Mode::ByteStr, range, err));
                         }
                     }
                 });
@@ -145,11 +171,11 @@ impl LitKind {
                 let bytes = if s.contains('\r') {
                     let mut buf = Vec::with_capacity(s.len());
                     let mut error = Ok(());
-                    unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
+                    unescape_literal(s, Mode::RawByteStr, &mut |range, c| match c {
                         Ok(c) => buf.push(byte_from_char(c)),
                         Err(err) => {
                             if err.is_fatal() {
-                                error = Err(LitError::LexerError);
+                                error = Err(LitError::EscapeError(Mode::RawByteStr, range, err));
                             }
                         }
                     });
@@ -165,15 +191,15 @@ impl LitKind {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
                 let mut error = Ok(());
-                unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
+                unescape_c_string(s, Mode::CStr, &mut |range, c| match c {
                     Ok(CStrUnit::Byte(b)) => buf.push(b),
                     Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
                     Ok(CStrUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }
                     Err(err) => {
                         if err.is_fatal() {
-                            error = Err(LitError::LexerError);
+                            error = Err(LitError::EscapeError(Mode::CStr, range, err));
                         }
                     }
                 });
@@ -185,15 +211,15 @@ impl LitKind {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
                 let mut error = Ok(());
-                unescape_c_string(s, Mode::RawCStr, &mut |_span, c| match c {
+                unescape_c_string(s, Mode::RawCStr, &mut |range, c| match c {
                     Ok(CStrUnit::Byte(b)) => buf.push(b),
                     Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
                     Ok(CStrUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }
                     Err(err) => {
                         if err.is_fatal() {
-                            error = Err(LitError::LexerError);
+                            error = Err(LitError::EscapeError(Mode::RawCStr, range, err));
                         }
                     }
                 });

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
@@ -212,6 +212,17 @@ impl Mode {
             CStr | RawCStr => "c",
         }
     }
+
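+    // The number of bytes before the content starts. E.g. for `RawStr` it is 2
+    // due to the leading `r"`. NOTE(review): this does not count any `#`s in a
+    // raw literal's opening delimiter (e.g. `br##"é"##`) -- confirm callers
+    // account for those separately.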
+    pub fn prefix_len(self) -> usize {
+        // njn: reorder?
+        match self {
+            Char | Str => 1,
+            Byte | ByteStr | CStr | RawStr => 2,
+            RawByteStr | RawCStr => 3,
+        }
+    }
 }
 
 fn scan_escape<T: From<u8> + From<char>>(

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
@@ -21,10 +21,10 @@ use rustc_span::{edition::Edition, BytePos, Pos, Span};
 
 mod diagnostics;
 mod tokentrees;
-mod unescape_error_reporting;
+pub(crate) mod unescape_error_reporting;
 mod unicode_chars;
 
-use unescape_error_reporting::{emit_unescape_error, escaped_char};
+use unescape_error_reporting::escaped_char;
 
 // This type is used a lot. Make sure it doesn't unintentionally get bigger.
 //
@@ -696,47 +696,47 @@ impl<'a> StringReader<'a> {
     fn cook_common(
         &self,
         kind: token::LitKind,
-        mode: Mode,
+        mode: Mode, // njn: remove
         start: BytePos,
         end: BytePos,
         prefix_len: u32,
         postfix_len: u32,
         unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
     ) -> (token::LitKind, Symbol) {
-        let mut has_fatal_err = false;
         let content_start = start + BytePos(prefix_len);
         let content_end = end - BytePos(postfix_len);
         let lit_content = self.str_from_to(content_start, content_end);
+        #[allow(unused)]
         unescape(lit_content, mode, &mut |range, result| {
             // Here we only check for errors. The actual unescaping is done later.
+            // njn: temp for comparison, remove eventually
             if let Err(err) = result {
+                // `span` is substring expressed as a span
+                // `range` is substring expressed as indices
                 let span_with_quotes = self.mk_sp(start, end);
                 let (start, end) = (range.start as u32, range.end as u32);
                 let lo = content_start + BytePos(start);
                 let hi = lo + BytePos(end - start);
                 let span = self.mk_sp(lo, hi);
                 if err.is_fatal() {
-                    has_fatal_err = true;
+                    //has_fatal_err = true;
                 }
-                emit_unescape_error(
-                    &self.sess.span_diagnostic,
-                    lit_content,
-                    span_with_quotes,
-                    span,
-                    mode,
-                    range,
-                    err,
-                );
+                // eprintln!(
+                //     "earl_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}, {:?}",
+                //     lit_content, span_with_quotes, span, mode, range, err
+                // );
+                // crate::lexer::unescape_error_reporting::emit_unescape_error(
+                //     &self.sess.span_diagnostic,
+                //     lit_content,
+                //     span_with_quotes,
+                //     span,
+                //     mode,
+                //     range,
+                //     err,
+                // );
             }
         });
-
-        // We normally exclude the quotes for the symbol, but for errors we
-        // include it because it results in clearer error messages.
-        if !has_fatal_err {
-            (kind, Symbol::intern(lit_content))
-        } else {
-            (token::Err, self.symbol_from_to(start, end))
-        }
+        (kind, Symbol::intern(lit_content))
     }
 
     fn cook_quoted(

diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
@@ -15,16 +15,16 @@ pub(crate) fn emit_unescape_error(
     lit: &str,
     // full span of the literal, including quotes
     span_with_quotes: Span,
-    // interior span of the literal, without quotes
+    // span of the erroneous part of the literal, i.e. `range` expressed as a span
+    // (njn: the old "interior span of the literal, without quotes" wording looks wrong)
     span: Span,
     mode: Mode,
     // range of the error inside `lit`
     range: Range<usize>,
     error: EscapeError,
 ) {
     debug!(
-        "emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}",
-        lit, span_with_quotes, mode, range, error
+        "emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}, {:?}",
+        lit, span_with_quotes, span, mode, range, error
     );
     let last_char = || {
         let c = lit[range.clone()].chars().next_back().unwrap();

diff --git a/compiler/rustc_parse/src/parser/expr.rs b/compiler/rustc_parse/src/parser/expr.rs
@@ -8,6 +8,7 @@ use super::{
 };
 
 use crate::errors;
+use crate::lexer::unescape_error_reporting::emit_unescape_error;
 use crate::maybe_recover_from_interpolated_ty_qpath;
 use ast::mut_visit::{noop_visit_expr, MutVisitor};
 use ast::{CoroutineKind, GenBlockKind, Pat, Path, PathSegment};
@@ -3651,6 +3652,22 @@ pub fn report_lit_error(sess: &ParseSess, err: LitError, lit: token::Lit, span:
         // `LexerError` is an error, but it was already reported
         // by lexer, so here we don't report it the second time.
         LitError::LexerError => {}
+        LitError::EscapeError(mode, range, err) => {
+            // njn: move this computation into emit_unescape_error?
+            let (start, end) = (range.start as u32, range.end as u32);
+            let lo = span.lo() + BytePos(mode.prefix_len() as u32) + BytePos(start);
+            let hi = lo + BytePos(end - start);
+            let char_span = span.with_lo(lo).with_hi(hi);
+            emit_unescape_error(
+                &sess.span_diagnostic,
+                symbol.as_str(),
+                span,
+                char_span,
+                mode,
+                range,
+                err,
+            );
+        }
         LitError::InvalidSuffix => {
             if let Some(suffix) = suffix {
                 sess.emit_err(errors::InvalidLiteralSuffix { span, kind: kind.descr(), suffix });

diff --git a/tests/ui/fmt/format-string-error-2.stderr b/tests/ui/fmt/format-string-error-2.stderr
@@ -1,9 +1,3 @@
-error: incorrect unicode escape sequence
-  --> $DIR/format-string-error-2.rs:77:20
-   |
-LL |     println!("\x7B}\u8 {", 1);
-   |                    ^^^ help: format of unicode escape sequences uses braces: `\u{8}`
-
 error: invalid format string: expected `'}'`, found `'a'`
   --> $DIR/format-string-error-2.rs:5:5
    |
@@ -155,6 +149,12 @@ LL |     println!("\x7B}\u{8} {", 1);
    |
    = note: if you intended to print `{`, you can escape it using `{{`
 
+error: incorrect unicode escape sequence
+  --> $DIR/format-string-error-2.rs:77:20
+   |
+LL |     println!("\x7B}\u8 {", 1);
+   |                    ^^^ help: format of unicode escape sequences uses braces: `\u{8}`
+
 error: invalid format string: unmatched `}` found
   --> $DIR/format-string-error-2.rs:81:21
    |

diff --git a/tests/ui/lexer/lex-bad-char-literals-7.rs b/tests/ui/lexer/lex-bad-char-literals-7.rs
@@ -1,4 +1,4 @@
-fn main() {
+fn main() { // njn: need to split the unterminated char literal out
     let _: char = '';
     //~^ ERROR: empty character literal
     let _: char = '\u{}';

diff --git a/tests/ui/parser/byte-literals.rs b/tests/ui/parser/byte-literals.rs
@@ -8,5 +8,5 @@ pub fn main() {
     b'	';  //~ ERROR byte constant must be escaped
     b''';  //~ ERROR byte constant must be escaped
     b'é';  //~ ERROR non-ASCII character in byte literal
-    b'a  //~ ERROR unterminated byte constant [E0763]
+    b'a  //~ ERROR unterminated byte constant [E0763]   // njn: split out
 }
diff --git a/tests/ui/parser/byte-string-literals.rs b/tests/ui/parser/byte-string-literals.rs
@@ -6,4 +6,5 @@ pub fn main() {
     b"é";  //~ ERROR non-ASCII character in byte string literal
     br##"é"##;  //~ ERROR non-ASCII character in raw byte string literal
     b"a  //~ ERROR unterminated double quote byte string
+    // njn: split out
 }
diff --git a/tests/ui/parser/issues/issue-23620-invalid-escapes.rs b/tests/ui/parser/issues/issue-23620-invalid-escapes.rs
@@ -1,4 +1,4 @@
-fn main() {
+fn main() { // njn: not sure about this one
     let _ = b"\u{a66e}";
     //~^ ERROR unicode escape in byte string
 

diff --git a/tests/ui/parser/issues/issue-62913.rs b/tests/ui/parser/issues/issue-62913.rs
@@ -1,4 +1,4 @@
-"\u\\"
-//~^ ERROR incorrect unicode escape sequence
-//~| ERROR invalid trailing slash in literal
-//~| ERROR expected item, found `"\u\"`
+fn main() {
+    _ = "\u\\";
+    //~^ ERROR invalid trailing slash in literal
+}