Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve error messages for raw strings (#60762) #70522

Merged
merged 7 commits into from
Apr 1, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/librustc_lexer/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ impl<'a> Cursor<'a> {
/// If requested position doesn't exist, `EOF_CHAR` is returned.
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
/// it should be checked with `is_eof` method.
fn nth_char(&self, n: usize) -> char {
pub(crate) fn nth_char(&self, n: usize) -> char {
Centril marked this conversation as resolved.
Show resolved Hide resolved
self.chars().nth(n).unwrap_or(EOF_CHAR)
}

Expand Down
131 changes: 109 additions & 22 deletions src/librustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@
mod cursor;
pub mod unescape;

#[cfg(test)]
mod tests;

use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::{Cursor, EOF_CHAR};
use std::convert::TryInto;

/// Parsed token.
/// It doesn't contain information about data that has been parsed,
Expand Down Expand Up @@ -132,9 +136,65 @@ pub enum LiteralKind {
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
RawStr { n_hashes: usize, started: bool, terminated: bool },
RawStr(UnvalidatedRawStr),
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
RawByteStr { n_hashes: usize, started: bool, terminated: bool },
RawByteStr(UnvalidatedRawStr),
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
Centril marked this conversation as resolved.
Show resolved Hide resolved
pub struct UnvalidatedRawStr {
valid_start: bool,
n_start_hashes: usize,
n_end_hashes: usize,
possible_terminator_offset: Option<usize>,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
/// Non # characters between `r` and `"` eg. `r#~"..`
Centril marked this conversation as resolved.
Show resolved Hide resolved
InvalidStarter,
/// The string was never terminated. `possible_terminator_offset` is the best guess of where they
/// may have intended to terminate it.
NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
/// More than 65536 # signs
TooManyDelimiters,
}

#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub struct ValidatedRawStr {
n_hashes: u16,
}

impl ValidatedRawStr {
pub fn num_hashes(&self) -> u16 {
self.n_hashes
}
}

impl UnvalidatedRawStr {
pub fn started(&self) -> bool {
self.valid_start
}

pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
if !self.valid_start {
return Err(LexRawStrError::InvalidStarter);
}

let n_start_safe: u16 =
self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
match (self.n_start_hashes, self.n_end_hashes) {
(n_start, n_end) if n_start > n_end => Err(LexRawStrError::NoTerminator {
expected: n_start,
found: self.n_end_hashes,
possible_terminator_offset: self.possible_terminator_offset,
}),
(n_start, n_end) => {
debug_assert_eq!(n_start, n_end);
Ok(ValidatedRawStr { n_hashes: n_start_safe })
}
}
Centril marked this conversation as resolved.
Show resolved Hide resolved
}
}

/// Base of numeric literal encoding according to its prefix.
Expand Down Expand Up @@ -209,7 +269,7 @@ pub fn is_whitespace(c: char) -> bool {
// Dedicated whitespace characters from Unicode
| '\u{2028}' // LINE SEPARATOR
| '\u{2029}' // PARAGRAPH SEPARATOR
=> true,
=> true,
_ => false,
}
}
Expand Down Expand Up @@ -258,12 +318,12 @@ impl Cursor<'_> {
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let raw_str_i = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
if terminated {
if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
self.eat_literal_suffix();
}
let kind = RawStr { n_hashes, started, terminated };
let kind = RawStr(raw_str_i);
Literal { kind, suffix_start }
}
_ => self.ident(),
Expand Down Expand Up @@ -293,12 +353,14 @@ impl Cursor<'_> {
}
('r', '"') | ('r', '#') => {
self.bump();
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let raw_str_i = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
if terminated {
self.eat_literal_suffix();
}
let kind = RawByteStr { n_hashes, started, terminated };

let kind = RawByteStr(raw_str_i);
Literal { kind, suffix_start }
}
_ => self.ident(),
Expand Down Expand Up @@ -594,37 +656,49 @@ impl Cursor<'_> {
false
}

/// Eats the double-quoted string and returns a tuple of
/// (amount of the '#' symbols, raw string started, raw string terminated)
fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
/// Eats the double-quoted string an UnvalidatedRawStr
Centril marked this conversation as resolved.
Show resolved Hide resolved
fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
debug_assert!(self.prev() == 'r');
let mut started: bool = false;
let mut finished: bool = false;
let mut valid_start: bool = false;
let start_pos = self.len_consumed();
let (mut possible_terminator_offset, mut max_hashes) = (None, 0);

// Count opening '#' symbols.
let n_hashes = self.eat_while(|c| c == '#');
let n_start_hashes = self.eat_while(|c| c == '#');

// Check that string is started.
match self.bump() {
Some('"') => started = true,
_ => return (n_hashes, started, finished),
Some('"') => valid_start = true,
_ => {
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes: 0,
possible_terminator_offset,
};
}
}

// Skip the string contents and on each '#' character met, check if this is
// a raw string termination.
while !finished {
loop {
self.eat_while(|c| c != '"');

if self.is_eof() {
return (n_hashes, started, finished);
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes: max_hashes,
possible_terminator_offset,
};
}

// Eat closing double quote.
self.bump();

// Check that amount of closing '#' symbols
// is equal to the amount of opening ones.
let mut hashes_left = n_hashes;
let mut hashes_left = n_start_hashes;
let is_closing_hash = |c| {
if c == '#' && hashes_left != 0 {
hashes_left -= 1;
Expand All @@ -633,10 +707,23 @@ impl Cursor<'_> {
false
}
};
finished = self.eat_while(is_closing_hash) == n_hashes;
let n_end_hashes = self.eat_while(is_closing_hash);

if n_end_hashes == n_start_hashes {
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes,
possible_terminator_offset: None,
};
} else if n_end_hashes > 0 && n_end_hashes > max_hashes {
rcoh marked this conversation as resolved.
Show resolved Hide resolved
// Keep track of possible terminators to give a hint about where there might be
// a missing terminator
possible_terminator_offset =
Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
max_hashes = n_end_hashes;
}
}

(n_hashes, started, finished)
}

fn eat_decimal_digits(&mut self) -> bool {
Expand Down
119 changes: 119 additions & 0 deletions src/librustc_lexer/src/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#[cfg(test)]
mod tests {
use crate::*;

fn check_raw_str(
s: &str,
expected: UnvalidatedRawStr,
validated: Result<ValidatedRawStr, LexRawStrError>,
) {
let mut cursor = Cursor::new(s);
let tok = cursor.raw_double_quoted_string(0);
assert_eq!(tok, expected);
assert_eq!(tok.validate(), validated);
}

#[test]
fn test_naked_raw_str() {
check_raw_str(
r#""abc""#,
UnvalidatedRawStr {
n_start_hashes: 0,
n_end_hashes: 0,
valid_start: true,
possible_terminator_offset: None,
},
Ok(ValidatedRawStr { n_hashes: 0 }),
);
}

#[test]
fn test_raw_no_start() {
check_raw_str(
r##""abc"#"##,
UnvalidatedRawStr {
n_start_hashes: 0,
n_end_hashes: 0,
valid_start: true,
possible_terminator_offset: None,
},
Ok(ValidatedRawStr { n_hashes: 0 }),
);
}

#[test]
fn test_too_many_terminators() {
// this error is handled in the parser later
check_raw_str(
r###"#"abc"##"###,
UnvalidatedRawStr {
n_start_hashes: 1,
n_end_hashes: 1,
valid_start: true,
possible_terminator_offset: None,
},
Ok(ValidatedRawStr { n_hashes: 1 }),
);
}

#[test]
fn test_unterminated() {
check_raw_str(
r#"#"abc"#,
UnvalidatedRawStr {
n_start_hashes: 1,
n_end_hashes: 0,
valid_start: true,
possible_terminator_offset: None,
},
Err(LexRawStrError::NoTerminator {
expected: 1,
found: 0,
possible_terminator_offset: None,
}),
);
check_raw_str(
r###"##"abc"#"###,
UnvalidatedRawStr {
n_start_hashes: 2,
n_end_hashes: 1,
valid_start: true,
possible_terminator_offset: Some(7),
},
Err(LexRawStrError::NoTerminator {
expected: 2,
found: 1,
possible_terminator_offset: Some(7),
}),
);
// We're looking for "# not just any #
check_raw_str(
r###"##"abc#"###,
UnvalidatedRawStr {
n_start_hashes: 2,
n_end_hashes: 0,
valid_start: true,
possible_terminator_offset: None,
},
Err(LexRawStrError::NoTerminator {
expected: 2,
found: 0,
possible_terminator_offset: None,
}),
)
}

#[test]
fn test_invalid_start() {
check_raw_str(
r##"#~"abc"#"##,
UnvalidatedRawStr {
n_start_hashes: 1,
n_end_hashes: 0,
valid_start: false,
possible_terminator_offset: None,
},
Err(LexRawStrError::InvalidStarter),
);
}
}
Loading