Skip to content

Commit

Permalink
XXX: delay unescaping
Browse files Browse the repository at this point in the history
Note that from_token_lit was looking for errors but never finding them!

- issue-62913.rs: The structure and output changed a bit. Issue rust-lang#62913
  was about an ICE due to an unterminated string literal, so the new
  version should be good enough.
- literals-are-validated-before-expansion.rs: this tests exactly the
  behaviour that has been changed.
  XXX: insert a new test covering more of that
  • Loading branch information
nnethercote committed Dec 7, 2023
1 parent 3469136 commit a7e1ee3
Show file tree
Hide file tree
Showing 16 changed files with 135 additions and 124 deletions.
98 changes: 62 additions & 36 deletions compiler/rustc_ast/src/util/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
use crate::token::{self, Token};
use rustc_lexer::unescape::{
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit,
Mode,
byte_from_char, unescape_c_string, unescape_literal, CStrUnit, EscapeError, Mode,
};
use rustc_span::symbol::{kw, sym, Symbol};
use rustc_span::Span;
use std::ops::Range;
use std::{ascii, fmt, str};

// Escapes a string, represented as a symbol. Reuses the original symbol,
Expand All @@ -32,7 +32,9 @@ pub fn escape_byte_str_symbol(bytes: &[u8]) -> Symbol {

#[derive(Debug)]
pub enum LitError {
LexerError,
LexerError, // njn: can I get rid of this?
// The range is the zero-based byte range of the offending character or escape sequence within the literal's content.
EscapeError(Mode, Range<usize>, EscapeError),
InvalidSuffix,
InvalidIntSuffix,
InvalidFloatSuffix,
Expand All @@ -54,14 +56,36 @@ impl LitKind {
LitKind::Bool(symbol == kw::True)
}
token::Byte => {
return unescape_byte(symbol.as_str())
.map(LitKind::Byte)
.map_err(|_| LitError::LexerError);
let mut res = None;
let mut error = Ok(());
unescape_literal(symbol.as_str(), Mode::Byte, &mut |range, unescaped_char| {
match unescaped_char {
Ok(c) => res = Some(c),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::EscapeError(Mode::Byte, range, err));
}
}
}
});
error?;
LitKind::Byte(byte_from_char(res.unwrap()))
}
token::Char => {
return unescape_char(symbol.as_str())
.map(LitKind::Char)
.map_err(|_| LitError::LexerError);
let mut res = None;
let mut error = Ok(());
unescape_literal(symbol.as_str(), Mode::Char, &mut |range, unescaped_char| {
match unescaped_char {
Ok(c) => res = Some(c),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::EscapeError(Mode::Char, range, err));
}
}
}
});
error?;
LitKind::Char(res.unwrap())
}

// There are some valid suffixes for integer and float literals,
Expand All @@ -85,11 +109,12 @@ impl LitKind {
s,
Mode::Str,
&mut #[inline(always)]
|_, unescaped_char| match unescaped_char {
|range, unescaped_char| match unescaped_char {
Ok(c) => buf.push(c),
Err(err) => {
// njn: what about warnings?
if err.is_fatal() {
error = Err(LitError::LexerError);
error = Err(LitError::EscapeError(Mode::Str, range, err));
}
}
},
Expand All @@ -104,36 +129,37 @@ impl LitKind {
token::StrRaw(n) => {
// Ditto.
let s = symbol.as_str();
let symbol =
if s.contains('\r') {
let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
let symbol = if s.contains('\r') {
let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
unescape_literal(s, Mode::RawStr, &mut |range, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::EscapeError(Mode::RawStr, range, err));
}
}
});
error?;
Symbol::intern(&buf)
} else {
symbol
};
}
});
error?;
Symbol::intern(&buf)
} else {
symbol
};
LitKind::Str(symbol, ast::StrStyle::Raw(n))
}
token::ByteStr => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
// njn: precursor: make this conditional? Likewise for
// ByteStrRaw, CStr, CStrRaw?
unescape_literal(s, Mode::ByteStr, &mut |range, c| match c {
Ok(c) => buf.push(byte_from_char(c)),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
error = Err(LitError::EscapeError(Mode::ByteStr, range, err));
}
}
});
Expand All @@ -145,11 +171,11 @@ impl LitKind {
let bytes = if s.contains('\r') {
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
unescape_literal(s, Mode::RawByteStr, &mut |range, c| match c {
Ok(c) => buf.push(byte_from_char(c)),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
error = Err(LitError::EscapeError(Mode::RawByteStr, range, err));
}
}
});
Expand All @@ -165,15 +191,15 @@ impl LitKind {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
unescape_c_string(s, Mode::CStr, &mut |range, c| match c {
Ok(CStrUnit::Byte(b)) => buf.push(b),
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
error = Err(LitError::EscapeError(Mode::CStr, range, err));
}
}
});
Expand All @@ -185,15 +211,15 @@ impl LitKind {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_c_string(s, Mode::RawCStr, &mut |_span, c| match c {
unescape_c_string(s, Mode::RawCStr, &mut |range, c| match c {
Ok(CStrUnit::Byte(b)) => buf.push(b),
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
error = Err(LitError::EscapeError(Mode::RawCStr, range, err));
}
}
});
Expand Down
11 changes: 11 additions & 0 deletions compiler/rustc_lexer/src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,17 @@ impl Mode {
CStr | RawCStr => "c",
}
}

// The number of bytes before the content starts. E.g. for `RawStr` it is 2
// due to the leading `r"`.
//
// The result is meant to be added to the start of a literal's span to find
// where its content begins.
//
// NOTE(review): raw literals written with `#` delimiters (e.g. `r#"..."#`,
// `br##"..."##`) have a longer prefix than the fixed values returned here;
// confirm how callers account for the hashes.
pub fn prefix_len(self) -> usize {
// njn: reorder?
match self {
// A single opening quote: `'` or `"`.
Char | Str => 1,
// One prefix letter plus the opening quote: `b'`, `b"`, `c"`, `r"`.
Byte | ByteStr | CStr | RawStr => 2,
// Two prefix letters plus the opening quote: `br"`, `cr"`.
RawByteStr | RawCStr => 3,
}
}
}

fn scan_escape<T: From<u8> + From<char>>(
Expand Down
44 changes: 22 additions & 22 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ use rustc_span::{edition::Edition, BytePos, Pos, Span};

mod diagnostics;
mod tokentrees;
mod unescape_error_reporting;
pub(crate) mod unescape_error_reporting;
mod unicode_chars;

use unescape_error_reporting::{emit_unescape_error, escaped_char};
use unescape_error_reporting::escaped_char;

// This type is used a lot. Make sure it doesn't unintentionally get bigger.
//
Expand Down Expand Up @@ -696,47 +696,47 @@ impl<'a> StringReader<'a> {
fn cook_common(
&self,
kind: token::LitKind,
mode: Mode,
mode: Mode, // njn: remove
start: BytePos,
end: BytePos,
prefix_len: u32,
postfix_len: u32,
unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
) -> (token::LitKind, Symbol) {
let mut has_fatal_err = false;
let content_start = start + BytePos(prefix_len);
let content_end = end - BytePos(postfix_len);
let lit_content = self.str_from_to(content_start, content_end);
#[allow(unused)]
unescape(lit_content, mode, &mut |range, result| {
// Here we only check for errors. The actual unescaping is done later.
// njn: temp for comparison, remove eventually
if let Err(err) = result {
// `span` is substring expressed as a span
// `range` is substring expressed as indices
let span_with_quotes = self.mk_sp(start, end);
let (start, end) = (range.start as u32, range.end as u32);
let lo = content_start + BytePos(start);
let hi = lo + BytePos(end - start);
let span = self.mk_sp(lo, hi);
if err.is_fatal() {
has_fatal_err = true;
//has_fatal_err = true;
}
emit_unescape_error(
&self.sess.span_diagnostic,
lit_content,
span_with_quotes,
span,
mode,
range,
err,
);
// eprintln!(
// "earl_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}, {:?}",
// lit_content, span_with_quotes, span, mode, range, err
// );
// crate::lexer::unescape_error_reporting::emit_unescape_error(
// &self.sess.span_diagnostic,
// lit_content,
// span_with_quotes,
// span,
// mode,
// range,
// err,
// );
}
});

// We normally exclude the quotes for the symbol, but for errors we
// include it because it results in clearer error messages.
if !has_fatal_err {
(kind, Symbol::intern(lit_content))
} else {
(token::Err, self.symbol_from_to(start, end))
}
(kind, Symbol::intern(lit_content))
}

fn cook_quoted(
Expand Down
6 changes: 3 additions & 3 deletions compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ pub(crate) fn emit_unescape_error(
lit: &str,
// full span of the literal, including quotes
span_with_quotes: Span,
// interior span of the literal, without quotes
// interior span of the literal, without quotes // njn: is that wrong?
span: Span,
mode: Mode,
// range of the error inside `lit`
range: Range<usize>,
error: EscapeError,
) {
debug!(
"emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}",
lit, span_with_quotes, mode, range, error
"emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}, {:?}",
lit, span_with_quotes, span, mode, range, error
);
let last_char = || {
let c = lit[range.clone()].chars().next_back().unwrap();
Expand Down
17 changes: 17 additions & 0 deletions compiler/rustc_parse/src/parser/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use super::{
};

use crate::errors;
use crate::lexer::unescape_error_reporting::emit_unescape_error;
use crate::maybe_recover_from_interpolated_ty_qpath;
use ast::mut_visit::{noop_visit_expr, MutVisitor};
use ast::{CoroutineKind, GenBlockKind, Pat, Path, PathSegment};
Expand Down Expand Up @@ -3651,6 +3652,22 @@ pub fn report_lit_error(sess: &ParseSess, err: LitError, lit: token::Lit, span:
// `LexerError` is an error, but it was already reported
// by the lexer, so we don't report it a second time here.
LitError::LexerError => {}
LitError::EscapeError(mode, range, err) => {
// njn: move this computation into emit_unescape_error?
let (start, end) = (range.start as u32, range.end as u32);
let lo = span.lo() + BytePos(mode.prefix_len() as u32) + BytePos(start);
let hi = lo + BytePos(end - start);
let char_span = span.with_lo(lo).with_hi(hi);
emit_unescape_error(
&sess.span_diagnostic,
symbol.as_str(),
span,
char_span,
mode,
range,
err,
);
}
LitError::InvalidSuffix => {
if let Some(suffix) = suffix {
sess.emit_err(errors::InvalidLiteralSuffix { span, kind: kind.descr(), suffix });
Expand Down
12 changes: 6 additions & 6 deletions tests/ui/fmt/format-string-error-2.stderr
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
error: incorrect unicode escape sequence
--> $DIR/format-string-error-2.rs:77:20
|
LL | println!("\x7B}\u8 {", 1);
| ^^^ help: format of unicode escape sequences uses braces: `\u{8}`

error: invalid format string: expected `'}'`, found `'a'`
--> $DIR/format-string-error-2.rs:5:5
|
Expand Down Expand Up @@ -155,6 +149,12 @@ LL | println!("\x7B}\u{8} {", 1);
|
= note: if you intended to print `{`, you can escape it using `{{`

error: incorrect unicode escape sequence
--> $DIR/format-string-error-2.rs:77:20
|
LL | println!("\x7B}\u8 {", 1);
| ^^^ help: format of unicode escape sequences uses braces: `\u{8}`

error: invalid format string: unmatched `}` found
--> $DIR/format-string-error-2.rs:81:21
|
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/lexer/lex-bad-char-literals-7.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
fn main() {
fn main() { // njn: need to split the unterminated char literal out
let _: char = '';
//~^ ERROR: empty character literal
let _: char = '\u{}';
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/parser/byte-literals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ pub fn main() {
b' '; //~ ERROR byte constant must be escaped
b'''; //~ ERROR byte constant must be escaped
b'é'; //~ ERROR non-ASCII character in byte literal
b'a //~ ERROR unterminated byte constant [E0763]
b'a //~ ERROR unterminated byte constant [E0763] // njn: split out
}
1 change: 1 addition & 0 deletions tests/ui/parser/byte-string-literals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ pub fn main() {
b"é"; //~ ERROR non-ASCII character in byte string literal
br##"é"##; //~ ERROR non-ASCII character in raw byte string literal
b"a //~ ERROR unterminated double quote byte string
// njn: split out
}
2 changes: 1 addition & 1 deletion tests/ui/parser/issues/issue-23620-invalid-escapes.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
fn main() {
fn main() { // njn: not sure about this one
let _ = b"\u{a66e}";
//~^ ERROR unicode escape in byte string

Expand Down
8 changes: 4 additions & 4 deletions tests/ui/parser/issues/issue-62913.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"\u\\"
//~^ ERROR incorrect unicode escape sequence
//~| ERROR invalid trailing slash in literal
//~| ERROR expected item, found `"\u\"`
fn main() {
_ = "\u\\";
//~^ ERROR invalid trailing slash in literal
}
Loading

0 comments on commit a7e1ee3

Please sign in to comment.