Skip to content

Commit

Permalink
Add comments about handled code ranges in each branch
Browse files Browse the repository at this point in the history
  • Loading branch information
mdboom committed Aug 31, 2022
1 parent 18927b1 commit f741a9d
Showing 1 changed file with 16 additions and 0 deletions.
16 changes: 16 additions & 0 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -501,19 +501,35 @@ valid_utf8(const unsigned char* s)
/* single-byte code */
return 1;
} else if (*s < 0xE0) {
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
if (*s < 0xC2) {
/* invalid sequence
\x80-\xBF -- continuation byte
\xC0-\xC1 -- fake 0000-007F */
return 0;
}
expected = 1;
} else if (*s < 0xF0) {
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
if (*s == 0xE0 && *(s + 1) < 0xA0) {
/* invalid sequence
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-07FF */
return 0;
} else if (*s == 0xED && *(s + 1) >= 0xA0) {
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
will result in surrogates in range D800-DFFF. Surrogates are
not valid UTF-8 so they are rejected.
See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
return 0;
}
expected = 2;
} else if (*s < 0xF5) {
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
/* invalid sequence -- one of:
\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
\xF4\x90\x80\x80- -- 110000- overflow */
return 0;
}
expected = 3;
Expand Down

0 comments on commit f741a9d

Please sign in to comment.