gccrs: add utf-8 validation for input source

tamaroning · philberty · commit 46a61f02f12d · 2023-07-06T16:12:19.000Z
gcc/rust/ChangeLog:

	* lex/rust-lex.cc (Lexer::input_source_is_valid_utf8): New method of `Lexer`.
	* lex/rust-lex.h: Likewise.
	* rust-session-manager.cc (Session::compile_crate): Add error.

gcc/testsuite/ChangeLog:

	* rust/compile/broken_utf8.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
@@ -167,6 +167,12 @@ Lexer::~Lexer ()
   // line_map->stop();
 }
 
+bool
+Lexer::input_source_is_valid_utf8 ()
+{
+  return raw_input_source->is_valid ();
+}
+
 /* TODO: need to optimise somehow to avoid the virtual function call in the
  * tight loop. Best idea at the moment is CRTP, but that might make lexer
  * implementation annoying when storing the "base class" (i.e. would need
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
@@ -175,6 +175,8 @@ class Lexer
   Lexer (Lexer &&other) = default;
   Lexer &operator= (Lexer &&other) = default;
 
+  bool input_source_is_valid_utf8 ();
+
   // Returns token n tokens ahead of current position.
   const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
   // Peeks the current token.
@@ -217,9 +219,9 @@ class Lexer
 
     Codepoint next_codepoint ()
     {
-      uint8_t input = next_byte ();
+      uint32_t input = next_byte ();
 
-      if ((int8_t) input == EOF)
+      if ((int32_t) input == EOF)
 	return Codepoint::eof ();
       else if (input < 128)
 	{
@@ -246,11 +248,13 @@ class Lexer
 	  // 3 bytes or UTF-8 BOM
 	  uint8_t input2 = next_byte ();
 	  // If the second byte is equal to 0xBB then the input is no longer a
-	  // valid UTF-8 char.
+	  // valid UTF-8 char. Then, we check if the third byte makes up a UTF
+	  // BOM.
 	  if (input == 0xEF && input2 == 0xBB)
 	    {
 	      uint8_t input3 = next_byte ();
 	      if (input3 == 0xBF)
+		// found BOM
 		return next_codepoint ();
 	      else
 		return {0xFFFE};
@@ -289,8 +293,6 @@ class Lexer
 	}
       else
 	{
-	  // rust_error_at (get_current_location (),
-	  //   "invalid UTF-8 [SECND] (too long)");
 	  return {0xFFFE};
 	}
     }
@@ -362,8 +364,7 @@ class Lexer
     {
       if (offs >= buffer.size ())
 	return EOF;
-
-      return buffer.at (offs++);
+      return (uint8_t) buffer.at (offs++);
     }
 
   public:
diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc
@@ -497,6 +497,14 @@ Session::compile_crate (const char *filename)
 
   Lexer lex (filename, std::move (file_wrap), linemap, dump_lex_opt);
 
+  if (!lex.input_source_is_valid_utf8 ())
+    {
+      rust_error_at (Linemap::unknown_location (),
+		     "cannot read %s; stream did not contain valid UTF-8",
+		     filename);
+      return;
+    }
+
   Parser<Lexer> parser (lex);
 
   // generate crate from parser
diff --git a/gcc/testsuite/rust/compile/broken_utf8.rs b/gcc/testsuite/rust/compile/broken_utf8.rs
@@ -0,0 +1,2 @@
+// { dg-excess-errors "stream did not contain valid UTF-8" }
+�

Original file line number	Diff line number	Diff line change
`@@ -175,6 +175,8 @@ class Lexer`
`175`	`175`	`Lexer (Lexer &&other) = default;`
`176`	`176`	`Lexer &operator= (Lexer &&other) = default;`
`177`	`177`
	`178`	`+ bool input_source_is_valid_utf8 ();`
	`179`	`+`
`178`	`180`	`// Returns token n tokens ahead of current position.`
`179`	`181`	`const_TokenPtr peek_token (int n) { return token_queue.peek (n); }`
`180`	`182`	`// Peeks the current token.`
`@@ -217,9 +219,9 @@ class Lexer`
`217`	`219`
`218`	`220`	`Codepoint next_codepoint ()`
`219`	`221`	`{`
`220`		`- uint8_t input = next_byte ();`
	`222`	`+ uint32_t input = next_byte ();`
`221`	`223`
`222`		`- if ((int8_t) input == EOF)`
	`224`	`+ if ((int32_t) input == EOF)`
`223`	`225`	`return Codepoint::eof ();`
`224`	`226`	`else if (input < 128)`
`225`	`227`	`{`
`@@ -246,11 +248,13 @@ class Lexer`
`246`	`248`	`// 3 bytes or UTF-8 BOM`
`247`	`249`	`uint8_t input2 = next_byte ();`
`248`	`250`	`// If the second byte is equal to 0xBB then the input is no longer a`
`249`		`- // valid UTF-8 char.`
	`251`	`+ // valid UTF-8 char. Then, we check if the third byte makes up a UTF`
	`252`	`+ // BOM.`
`250`	`253`	`if (input == 0xEF && input2 == 0xBB)`
`251`	`254`	`{`
`252`	`255`	`uint8_t input3 = next_byte ();`
`253`	`256`	`if (input3 == 0xBF)`
	`257`	`+ // found BOM`
`254`	`258`	`return next_codepoint ();`
`255`	`259`	`else`
`256`	`260`	`return {0xFFFE};`
`@@ -289,8 +293,6 @@ class Lexer`
`289`	`293`	`}`
`290`	`294`	`else`
`291`	`295`	`{`
`292`		`- // rust_error_at (get_current_location (),`
`293`		`- // "invalid UTF-8 [SECND] (too long)");`
`294`	`296`	`return {0xFFFE};`
`295`	`297`	`}`
`296`	`298`	`}`
`@@ -362,8 +364,7 @@ class Lexer`
`362`	`364`	`{`
`363`	`365`	`if (offs >= buffer.size ())`
`364`	`366`	`return EOF;`
`365`		`-`
`366`		`- return buffer.at (offs++);`
	`367`	`+ return (uint8_t) buffer.at (offs++);`
`367`	`368`	`}`
`368`	`369`
`369`	`370`	`public:`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+// { dg-excess-errors "stream did not contain valid UTF-8" }`
	`2`	`+ÿ`