Skip to content

Commit

Permalink
Tidy up CharStreams API. Add new doc/unicode.md
Browse files Browse the repository at this point in the history
  • Loading branch information
bhamiltoncx committed Mar 16, 2017
1 parent b467dc8 commit 4f21686
Show file tree
Hide file tree
Showing 10 changed files with 415 additions and 126 deletions.
2 changes: 1 addition & 1 deletion doc/faq/general.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ Make sure to use two-stage parsing. See example in [bug report](https://github.c

```Java

CharStream input = new ANTLRFileStream(args[0]);
CharStream input = CharStreams.fromPath(Paths.get(args[0]));
ExprLexer lexer = new ExprLexer(input);
CommonTokenStream tokens = new CommonTokenStream(lexer);
ExprParser parser = new ExprParser(tokens);
Expand Down
4 changes: 2 additions & 2 deletions doc/interpreters.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public static ParseTree parse(String fileName,
throws IOException
{
final Grammar g = Grammar.load(combinedGrammarFileName);
LexerInterpreter lexEngine = g.createLexerInterpreter(new ANTLRFileStream(fileName));
LexerInterpreter lexEngine = g.createLexerInterpreter(CharStreams.fromPath(Paths.get(fileName)));
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
ParserInterpreter parser = g.createParserInterpreter(tokens);
ParseTree t = parser.parse(g.getRule(startRule).index);
Expand Down Expand Up @@ -58,7 +58,7 @@ public static ParseTree parse(String fileNameToParse,
{
final LexerGrammar lg = (LexerGrammar) Grammar.load(lexerGrammarFileName);
final Grammar pg = Grammar.load(parserGrammarFileName, lg);
ANTLRFileStream input = new ANTLRFileStream(fileNameToParse);
CharStream input = CharStreams.fromPath(Paths.get(fileNameToParse));
LexerInterpreter lexEngine = lg.createLexerInterpreter(input);
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
ParserInterpreter parser = pg.createParserInterpreter(tokens);
Expand Down
68 changes: 68 additions & 0 deletions doc/unicode.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Lexers and Unicode text

Before ANTLR 4.7, generated lexers only supported part of the Unicode standard
(code points up to `U+FFFF`).

With ANTLR 4.7 and later, lexers as well as all languages' runtimes
support the full range of Unicode code points up to `U+10FFFF`, as
long as the input `CharStream` is opened using `CharStreams.fromPath()`
or the equivalent method for your runtime's language.

# Unicode Code Points in Lexer Grammars

To refer to Unicode [code points](https://en.wikipedia.org/wiki/Code_point)
in lexer grammars, use the `\u` string escape. For example, the following
lexer rule matches a single Cyrillic character using the range
`U+0400` to `U+04FF`:

```ANTLR
// Matches a single code point in the range U+0400..U+04FF.
CYRILLIC : ('\u0400'..'\u04FF') ;
```

Unicode literals larger than `U+FFFF` must use the extended brace syntax, such as `\u{12345}`.
For example, the following lexer rule matches a selection of smiley faces
from the [Emoticons Unicode block](http://www.unicode.org/charts/PDF/U1F600.pdf):

```ANTLR
// Matches any one of three code points above U+FFFF, written with the extended brace escape.
EMOTICONS : ('\u{1F600}' | '\u{1F602}' | '\u{1F615}') ;
```

Finally, lexer char sets can include Unicode properties:

```ANTLR
// Char set built from a single Unicode property.
EMOJI : [\p{Emoji}] ;
// Several properties may be combined inside one char set.
JAPANESE : [\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}] ;
// Capital \P negates the property: any code point that is NOT in the Cyrillic script.
NOT_CYRILLIC : [\P{Script=Cyrillic}] ;
```

See [lexer-rules.md](lexer-rules.md#lexer-rule-elements) for more detail on Unicode
escapes in lexer rules.

# CharStreams and UTF-8

If your lexer grammar contains code points larger than `U+FFFF`, your
lexer client code must open the file using `CharStreams.fromPath()` or
the equivalent in your runtime's language; otherwise, input code points
larger than `U+FFFF` will *not* match.

For backwards compatibility, the existing `ANTLRInputStream` and
`ANTLRFileStream` APIs only support Unicode code points up to `U+FFFF`.

The existing `TestRig` command-line interface supports all Unicode
code points.

# Example

If you have generated a lexer named `UnicodeLexer`:

```Java
/**
 * Lexes the file named by the first command-line argument with
 * {@code UnicodeLexer} and prints every token it produces.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the input file
 * @throws IOException if the input file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    // CharStreams.fromPath() is required for code points above U+FFFF to match.
    CharStream charStream = CharStreams.fromPath(Paths.get(args[0]));
    Lexer lexer = new UnicodeLexer(charStream);
    CommonTokenStream tokens = new CommonTokenStream(lexer);
    // Force the whole input to be tokenized before iterating.
    tokens.fill();
    for (Token token : tokens.getTokens()) {
        System.out.println("Got token: " + token.toString());
    }
}
```
Original file line number Diff line number Diff line change
Expand Up @@ -923,7 +923,7 @@ protected void writeTestFile(String parserName,
"\n" +
"public class Test {\n" +
" public static void main(String[] args) throws Exception {\n" +
" CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
" CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
" <lexerName> lex = new <lexerName>(input);\n" +
" CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
" <createParser>\n"+
Expand Down Expand Up @@ -980,7 +980,7 @@ protected void writeLexerTestFile(String lexerName, boolean showDFA) {
"\n" +
"public class Test {\n" +
" public static void main(String[] args) throws Exception {\n" +
" CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
" CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
" <lexerName> lex = new <lexerName>(input);\n" +
" CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
" tokens.fill();\n" +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import java.nio.channels.SeekableByteChannel;
import java.nio.charset.CharacterCodingException;
Expand All @@ -20,6 +21,7 @@

import java.util.Arrays;

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CodePointCharStream;

Expand All @@ -36,74 +38,74 @@ public class TestCharStreams {
public ExpectedException thrown = ExpectedException.none();

@Test
public void createWithBMPStringHasExpectedSize() {
CodePointCharStream s = CharStreams.createWithString("hello");
public void fromBMPStringHasExpectedSize() {
CharStream s = CharStreams.fromString("hello");
assertEquals(5, s.size());
assertEquals(0, s.index());
assertEquals("hello", s.toString());
}

@Test
public void createWithSMPStringHasExpectedSize() {
CodePointCharStream s = CharStreams.createWithString(
public void fromSMPStringHasExpectedSize() {
CharStream s = CharStreams.fromString(
"hello \uD83C\uDF0E");
assertEquals(7, s.size());
assertEquals(0, s.index());
assertEquals("hello \uD83C\uDF0E", s.toString());
}

@Test
public void createWithBMPUTF8PathHasExpectedSize() throws Exception {
public void fromBMPUTF8PathHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
CodePointCharStream s = CharStreams.createWithUTF8(p);
CharStream s = CharStreams.fromPath(p);
assertEquals(5, s.size());
assertEquals(0, s.index());
assertEquals("hello", s.toString());
assertEquals(p.toString(), s.getSourceName());
}

@Test
public void createWithSMPUTF8PathHasExpectedSize() throws Exception {
public void fromSMPUTF8PathHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
CodePointCharStream s = CharStreams.createWithUTF8(p);
CharStream s = CharStreams.fromPath(p);
assertEquals(7, s.size());
assertEquals(0, s.index());
assertEquals("hello \uD83C\uDF0E", s.toString());
assertEquals(p.toString(), s.getSourceName());
}

@Test
public void createWithBMPUTF8InputStreamHasExpectedSize() throws Exception {
public void fromBMPUTF8InputStreamHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
try (InputStream is = Files.newInputStream(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
CharStream s = CharStreams.fromStream(is);
assertEquals(5, s.size());
assertEquals(0, s.index());
assertEquals("hello", s.toString());
}
}

@Test
public void createWithSMPUTF8InputStreamHasExpectedSize() throws Exception {
public void fromSMPUTF8InputStreamHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
try (InputStream is = Files.newInputStream(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
CharStream s = CharStreams.fromStream(is);
assertEquals(7, s.size());
assertEquals(0, s.index());
assertEquals("hello \uD83C\uDF0E", s.toString());
}
}

@Test
public void createWithBMPUTF8ChannelHasExpectedSize() throws Exception {
public void fromBMPUTF8ChannelHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Channel(
CharStream s = CharStreams.fromChannel(
c, 4096, CodingErrorAction.REPLACE, "foo");
assertEquals(5, s.size());
assertEquals(0, s.index());
Expand All @@ -113,11 +115,11 @@ public void createWithBMPUTF8ChannelHasExpectedSize() throws Exception {
}

@Test
public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception {
public void fromSMPUTF8ChannelHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Channel(
CharStream s = CharStreams.fromChannel(
c, 4096, CodingErrorAction.REPLACE, "foo");
assertEquals(7, s.size());
assertEquals(0, s.index());
Expand All @@ -127,13 +129,13 @@ public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception {
}

@Test
public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
public void fromInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
throws Exception {
Path p = folder.newFile().toPath();
byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE, (byte)0xFE, (byte)0xED };
Files.write(p, toWrite);
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Channel(
CharStream s = CharStreams.fromChannel(
c, 4096, CodingErrorAction.REPLACE, "foo");
assertEquals(3, s.size());
assertEquals(0, s.index());
Expand All @@ -142,22 +144,22 @@ public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode(
}

@Test
public void createWithInvalidUTF8BytesThrowsInReportMode() throws Exception {
public void fromInvalidUTF8BytesThrowsInReportMode() throws Exception {
Path p = folder.newFile().toPath();
byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE };
Files.write(p, toWrite);
try (SeekableByteChannel c = Files.newByteChannel(p)) {
thrown.expect(CharacterCodingException.class);
CharStreams.createWithUTF8Channel(c, 4096, CodingErrorAction.REPORT, "foo");
CharStreams.fromChannel(c, 4096, CodingErrorAction.REPORT, "foo");
}
}

@Test
public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
public void fromSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Channel(
CharStream s = CharStreams.fromChannel(
c,
// Note this buffer size ensures the SMP code point
// straddles the boundary of two buffers
Expand All @@ -169,4 +171,40 @@ public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception
assertEquals("hello \uD83C\uDF0E", s.toString());
}
}

/**
 * Checks that a stream opened by file name reports its size in code points,
 * starts at index 0, round-trips its text, and uses the file name as its
 * source name.
 */
@Test
public void fromFileName() throws Exception {
    final String text = "hello \uD83C\uDF0E";
    final Path file = folder.newFile().toPath();
    final String fileName = file.toString();
    Files.write(file, text.getBytes(StandardCharsets.UTF_8));

    final CharStream stream = CharStreams.fromFileName(fileName);

    // Six BMP characters plus one supplementary code point (a surrogate pair in the literal).
    assertEquals(7, stream.size());
    assertEquals(0, stream.index());
    assertEquals(text, stream.toString());
    assertEquals(fileName, stream.getSourceName());
}

/**
 * Checks that a file written as ISO-8859-1 is decoded correctly when that
 * charset is passed explicitly alongside the file name.
 */
@Test
public void fromFileNameWithLatin1() throws Exception {
    final String text = "hello \u00CA\u00FE";
    final Path file = folder.newFile().toPath();
    final String fileName = file.toString();
    Files.write(file, text.getBytes(StandardCharsets.ISO_8859_1));

    final CharStream stream =
        CharStreams.fromFileName(fileName, StandardCharsets.ISO_8859_1);

    // All eight characters of the text are expected back, one code point each.
    assertEquals(8, stream.size());
    assertEquals(0, stream.index());
    assertEquals(text, stream.toString());
    assertEquals(fileName, stream.getSourceName());
}

/**
 * Checks that a stream built from a {@link Reader} reports its size in code
 * points, starts at index 0, and round-trips its text.
 */
@Test
public void fromReader() throws Exception {
    final String text = "hello \uD83C\uDF0E";
    final Path file = folder.newFile().toPath();
    Files.write(file, text.getBytes(StandardCharsets.UTF_8));

    try (Reader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
        final CharStream stream = CharStreams.fromReader(reader);

        // Six BMP characters plus one supplementary code point (a surrogate pair in the literal).
        assertEquals(7, stream.size());
        assertEquals(0, stream.index());
        assertEquals(text, stream.toString());
    }
}
}
Loading

0 comments on commit 4f21686

Please sign in to comment.