rust-lang · bors · Apr 1, 2020 · Mar 28, 2020 · Mar 28, 2020 · Mar 29, 2020
diff --git a/src/librustc_lexer/src/cursor.rs b/src/librustc_lexer/src/cursor.rs
@@ -41,7 +41,7 @@ impl<'a> Cursor<'a> {
     /// If requested position doesn't exist, `EOF_CHAR` is returned.
     /// However, getting `EOF_CHAR` doesn't always mean actual end of file,
     /// it should be checked with `is_eof` method.
-    fn nth_char(&self, n: usize) -> char {
+    pub(crate) fn nth_char(&self, n: usize) -> char {
         self.chars().nth(n).unwrap_or(EOF_CHAR)
     }
 

diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs
@@ -17,9 +17,13 @@
 mod cursor;
 pub mod unescape;
 
+#[cfg(test)]
+mod tests;
+
 use self::LiteralKind::*;
 use self::TokenKind::*;
 use crate::cursor::{Cursor, EOF_CHAR};
+use std::convert::TryInto;
 
 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@@ -132,9 +136,65 @@ pub enum LiteralKind {
     /// "b"abc"", "b"abc"
     ByteStr { terminated: bool },
     /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr { n_hashes: usize, started: bool, terminated: bool },
+    RawStr(UnvalidatedRawStr),
     /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr { n_hashes: usize, started: bool, terminated: bool },
+    RawByteStr(UnvalidatedRawStr),
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct UnvalidatedRawStr {
+    valid_start: bool,
+    n_start_hashes: usize,
+    n_end_hashes: usize,
+    possible_terminator_offset: Option<usize>,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LexRawStrError {
+    /// Non # characters between `r` and `"` eg. `r#~"..`
+    InvalidStarter,
+    /// The string was never terminated. `possible_terminator_offset` is the best guess of where they
+    /// may have intended to terminate it.
+    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+    /// More than 65536 # signs
+    TooManyDelimiters,
+}
+
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub struct ValidatedRawStr {
+    n_hashes: u16,
+}
+
+impl ValidatedRawStr {
+    pub fn num_hashes(&self) -> u16 {
+        self.n_hashes
+    }
+}
+
+impl UnvalidatedRawStr {
+    pub fn started(&self) -> bool {
+        self.valid_start
+    }
+
+    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
+        if !self.valid_start {
+            return Err(LexRawStrError::InvalidStarter);
+        }
+
+        let n_start_safe: u16 =
+            self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
+        match (self.n_start_hashes, self.n_end_hashes) {
+            (n_start, n_end) if n_start > n_end => Err(LexRawStrError::NoTerminator {
+                expected: n_start,
+                found: self.n_end_hashes,
+                possible_terminator_offset: self.possible_terminator_offset,
+            }),
+            (n_start, n_end) => {
+                debug_assert_eq!(n_start, n_end);
+                Ok(ValidatedRawStr { n_hashes: n_start_safe })
+            }
+        }
+    }
 }
 
 /// Base of numeric literal encoding according to its prefix.
@@ -209,7 +269,7 @@ pub fn is_whitespace(c: char) -> bool {
         // Dedicated whitespace characters from Unicode
         | '\u{2028}' // LINE SEPARATOR
         | '\u{2029}' // PARAGRAPH SEPARATOR
-            => true,
+        => true,
         _ => false,
     }
 }
@@ -258,12 +318,12 @@ impl Cursor<'_> {
             'r' => match (self.first(), self.second()) {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
-                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let raw_str_i = self.raw_double_quoted_string(1);
                     let suffix_start = self.len_consumed();
-                    if terminated {
+                    if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawStr { n_hashes, started, terminated };
+                    let kind = RawStr(raw_str_i);
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident(),
@@ -293,12 +353,14 @@ impl Cursor<'_> {
                 }
                 ('r', '"') | ('r', '#') => {
                     self.bump();
-                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let raw_str_i = self.raw_double_quoted_string(2);
                     let suffix_start = self.len_consumed();
+                    let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
                     if terminated {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawByteStr { n_hashes, started, terminated };
+
+                    let kind = RawByteStr(raw_str_i);
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident(),
@@ -594,37 +656,49 @@ impl Cursor<'_> {
         false
     }
 
-    /// Eats the double-quoted string and returns a tuple of
-    /// (amount of the '#' symbols, raw string started, raw string terminated)
-    fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
+    /// Eats the double-quoted string an UnvalidatedRawStr
+    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
         debug_assert!(self.prev() == 'r');
-        let mut started: bool = false;
-        let mut finished: bool = false;
+        let mut valid_start: bool = false;
+        let start_pos = self.len_consumed();
+        let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
 
         // Count opening '#' symbols.
-        let n_hashes = self.eat_while(|c| c == '#');
+        let n_start_hashes = self.eat_while(|c| c == '#');
 
         // Check that string is started.
         match self.bump() {
-            Some('"') => started = true,
-            _ => return (n_hashes, started, finished),
+            Some('"') => valid_start = true,
+            _ => {
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes: 0,
+                    possible_terminator_offset,
+                };
+            }
         }
 
         // Skip the string contents and on each '#' character met, check if this is
         // a raw string termination.
-        while !finished {
+        loop {
             self.eat_while(|c| c != '"');
 
             if self.is_eof() {
-                return (n_hashes, started, finished);
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes: max_hashes,
+                    possible_terminator_offset,
+                };
             }
 
             // Eat closing double quote.
             self.bump();
 
             // Check that amount of closing '#' symbols
             // is equal to the amount of opening ones.
-            let mut hashes_left = n_hashes;
+            let mut hashes_left = n_start_hashes;
             let is_closing_hash = |c| {
                 if c == '#' && hashes_left != 0 {
                     hashes_left -= 1;
@@ -633,10 +707,23 @@ impl Cursor<'_> {
                     false
                 }
             };
-            finished = self.eat_while(is_closing_hash) == n_hashes;
+            let n_end_hashes = self.eat_while(is_closing_hash);
+
+            if n_end_hashes == n_start_hashes {
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes,
+                    possible_terminator_offset: None,
+                };
+            } else if n_end_hashes > 0 && n_end_hashes > max_hashes {
+                // Keep track of possible terminators to give a hint about where there might be
+                // a missing terminator
+                possible_terminator_offset =
+                    Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+                max_hashes = n_end_hashes;
+            }
         }
-
-        (n_hashes, started, finished)
     }
 
     fn eat_decimal_digits(&mut self) -> bool {

diff --git a/src/librustc_lexer/src/tests.rs b/src/librustc_lexer/src/tests.rs
@@ -0,0 +1,119 @@
+#[cfg(test)]
+mod tests {
+    use crate::*;
+
+    fn check_raw_str(
+        s: &str,
+        expected: UnvalidatedRawStr,
+        validated: Result<ValidatedRawStr, LexRawStrError>,
+    ) {
+        let mut cursor = Cursor::new(s);
+        let tok = cursor.raw_double_quoted_string(0);
+        assert_eq!(tok, expected);
+        assert_eq!(tok.validate(), validated);
+    }
+
+    #[test]
+    fn test_naked_raw_str() {
+        check_raw_str(
+            r#""abc""#,
+            UnvalidatedRawStr {
+                n_start_hashes: 0,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 0 }),
+        );
+    }
+
+    #[test]
+    fn test_raw_no_start() {
+        check_raw_str(
+            r##""abc"#"##,
+            UnvalidatedRawStr {
+                n_start_hashes: 0,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 0 }),
+        );
+    }
+
+    #[test]
+    fn test_too_many_terminators() {
+        // this error is handled in the parser later
+        check_raw_str(
+            r###"#"abc"##"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 1,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 1 }),
+        );
+    }
+
+    #[test]
+    fn test_unterminated() {
+        check_raw_str(
+            r#"#"abc"#,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 1,
+                found: 0,
+                possible_terminator_offset: None,
+            }),
+        );
+        check_raw_str(
+            r###"##"abc"#"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 2,
+                n_end_hashes: 1,
+                valid_start: true,
+                possible_terminator_offset: Some(7),
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 2,
+                found: 1,
+                possible_terminator_offset: Some(7),
+            }),
+        );
+        // We're looking for "# not just any #
+        check_raw_str(
+            r###"##"abc#"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 2,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 2,
+                found: 0,
+                possible_terminator_offset: None,
+            }),
+        )
+    }
+
+    #[test]
+    fn test_invalid_start() {
+        check_raw_str(
+            r##"#~"abc"#"##,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 0,
+                valid_start: false,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::InvalidStarter),
+        );
+    }
+}