Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-96268: Fix loading invalid UTF-8 #96270

Merged
merged 11 commits into from
Sep 7, 2022
13 changes: 10 additions & 3 deletions Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,16 +236,23 @@ def test_invalid_utf8(self):
# test it is to write actual files to disk.

# Each example is put inside a string at the top of the file so
# it's an otherwise valid Python source file.
template = b'"%s"\n'
# it's an otherwise valid Python source file. Put some newlines
# beforehand so we can assert that the error is reported on the
# correct line.
template = b'\n\n\n"%s"\n'

fn = TESTFN
self.addCleanup(unlink, fn)

def check(content):
with open(fn, 'wb') as fp:
fp.write(template % content)
script_helper.assert_python_failure(fn)
rc, stdout, stderr = script_helper.assert_python_failure(fn)
# We want to assert that the python subprocess failed gracefully,
# not via a signal.
self.assertGreaterEqual(rc, 1)
self.assertTrue(b"Non-UTF-8 code starting with" in stderr)
mdboom marked this conversation as resolved.
Show resolved Hide resolved
self.assertTrue(b"on line 5" in stderr)
mdboom marked this conversation as resolved.
Show resolved Hide resolved

# continuation bytes in a sequence of 2, 3, or 4 bytes
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Loading a Python source file that contains invalid UTF-8 will now report
the offending byte sequence on the correct line.
30 changes: 21 additions & 9 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,25 +489,37 @@ static void fp_ungetc(int c, struct tok_state *tok) {

/* Check whether the characters at s start a valid
UTF-8 sequence. Return the number of bytes forming
the sequence if yes, 0 if not. */
the sequence if yes, 0 if not. The special cases match
those in stringlib/codecs.h:decode_utf8.
mdboom marked this conversation as resolved.
Show resolved Hide resolved
*/
static int valid_utf8(const unsigned char* s)
mdboom marked this conversation as resolved.
Show resolved Hide resolved
{
int expected = 0;
int length;
if (*s < 0x80)
if (*s < 0x80) {
/* single-byte code */
return 1;
if (*s < 0xc0)
/* following byte */
return 0;
if (*s < 0xE0)
} else if (*s < 0xE0) {
if (*s < 0xC2) {
gvanrossum marked this conversation as resolved.
Show resolved Hide resolved
return 0;
}
expected = 1;
else if (*s < 0xF0)
} else if (*s < 0xF0) {
if (*s == 0xE0 && *(s + 1) < 0xA0) {
return 0;
} else if (*s == 0xED && *(s + 1) >= 0xA0) {
return 0;
}
expected = 2;
else if (*s < 0xF8)
} else if (*s < 0xF5) {
if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
return 0;
}
gvanrossum marked this conversation as resolved.
Show resolved Hide resolved
expected = 3;
else
} else {
/* invalid start byte */
return 0;
}
length = expected + 1;
for (; expected; expected--)
if (s[expected] < 0x80 || s[expected] >= 0xC0)
Expand Down