Skip to content

Commit 46a61f0

Browse files
tamaroning authored and philberty committed
gccrs: add utf-8 validation for input source
gcc/rust/ChangeLog:

  * lex/rust-lex.cc (Lexer::input_source_is_valid_utf8): New method of `Lexer`.
  * lex/rust-lex.h: Likewise.
  * rust-session-manager.cc (Session::compile_crate): Add error.

gcc/testsuite/ChangeLog:

  * rust/compile/broken_utf8.rs: New test.

Signed-off-by: Raiki Tamura <[email protected]>
1 parent 5e735e9 commit 46a61f0

File tree

4 files changed

+24
-7
lines changed

4 files changed

+24
-7
lines changed

gcc/rust/lex/rust-lex.cc

+6
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,12 @@ Lexer::~Lexer ()
167167
// line_map->stop();
168168
}
169169

170+
bool
171+
Lexer::input_source_is_valid_utf8 ()
172+
{
173+
return raw_input_source->is_valid ();
174+
}
175+
170176
/* TODO: need to optimise somehow to avoid the virtual function call in the
171177
* tight loop. Best idea at the moment is CRTP, but that might make lexer
172178
* implementation annoying when storing the "base class" (i.e. would need

gcc/rust/lex/rust-lex.h

+8 -7
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,8 @@ class Lexer
175175
Lexer (Lexer &&other) = default;
176176
Lexer &operator= (Lexer &&other) = default;
177177

178+
bool input_source_is_valid_utf8 ();
179+
178180
// Returns token n tokens ahead of current position.
179181
const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
180182
// Peeks the current token.
@@ -217,9 +219,9 @@ class Lexer
217219

218220
Codepoint next_codepoint ()
219221
{
220-
uint8_t input = next_byte ();
222+
uint32_t input = next_byte ();
221223

222-
if ((int8_t) input == EOF)
224+
if ((int32_t) input == EOF)
223225
return Codepoint::eof ();
224226
else if (input < 128)
225227
{
@@ -246,11 +248,13 @@ class Lexer
246248
// 3 bytes or UTF-8 BOM
247249
uint8_t input2 = next_byte ();
248250
// If the second byte is equal to 0xBB then the input is no longer a
249-
// valid UTF-8 char.
251+
// valid UTF-8 char. Then, we check if the third byte makes up a UTF
252+
// BOM.
250253
if (input == 0xEF && input2 == 0xBB)
251254
{
252255
uint8_t input3 = next_byte ();
253256
if (input3 == 0xBF)
257+
// found BOM
254258
return next_codepoint ();
255259
else
256260
return {0xFFFE};
@@ -289,8 +293,6 @@ class Lexer
289293
}
290294
else
291295
{
292-
// rust_error_at (get_current_location (),
293-
// "invalid UTF-8 [SECND] (too long)");
294296
return {0xFFFE};
295297
}
296298
}
@@ -362,8 +364,7 @@ class Lexer
362364
{
363365
if (offs >= buffer.size ())
364366
return EOF;
365-
366-
return buffer.at (offs++);
367+
return (uint8_t) buffer.at (offs++);
367368
}
368369

369370
public:

gcc/rust/rust-session-manager.cc

+8
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,14 @@ Session::compile_crate (const char *filename)
497497

498498
Lexer lex (filename, std::move (file_wrap), linemap, dump_lex_opt);
499499

500+
if (!lex.input_source_is_valid_utf8 ())
501+
{
502+
rust_error_at (Linemap::unknown_location (),
503+
"cannot read %s; stream did not contain valid UTF-8",
504+
filename);
505+
return;
506+
}
507+
500508
Parser<Lexer> parser (lex);
501509

502510
// generate crate from parser
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// { dg-excess-errors "stream did not contain valid UTF-8" }
2+
ÿ

0 commit comments

Comments
 (0)