diff --git a/doc/faq/general.md b/doc/faq/general.md index 9189ac9abe..b7d8dd82e3 100644 --- a/doc/faq/general.md +++ b/doc/faq/general.md @@ -82,7 +82,7 @@ Make sure to use two-stage parsing. See example in [bug report](https://github.c ```Java -CharStream input = new ANTLRFileStream(args[0]); +CharStream input = CharStreams.fromPath(Paths.get(args[0])); ExprLexer lexer = new ExprLexer(input); CommonTokenStream tokens = new CommonTokenStream(lexer); ExprParser parser = new ExprParser(tokens); diff --git a/doc/interpreters.md b/doc/interpreters.md index c99e6d5805..c387ae6b2e 100644 --- a/doc/interpreters.md +++ b/doc/interpreters.md @@ -30,7 +30,7 @@ public static ParseTree parse(String fileName, throws IOException { final Grammar g = Grammar.load(combinedGrammarFileName); - LexerInterpreter lexEngine = g.createLexerInterpreter(new ANTLRFileStream(fileName)); + LexerInterpreter lexEngine = g.createLexerInterpreter(CharStreams.fromPath(Paths.get(fileName))); CommonTokenStream tokens = new CommonTokenStream(lexEngine); ParserInterpreter parser = g.createParserInterpreter(tokens); ParseTree t = parser.parse(g.getRule(startRule).index); @@ -58,7 +58,7 @@ public static ParseTree parse(String fileNameToParse, { final LexerGrammar lg = (LexerGrammar) Grammar.load(lexerGrammarFileName); final Grammar pg = Grammar.load(parserGrammarFileName, lg); - ANTLRFileStream input = new ANTLRFileStream(fileNameToParse); + CharStream input = CharStreams.fromPath(Paths.get(fileNameToParse)); LexerInterpreter lexEngine = lg.createLexerInterpreter(input); CommonTokenStream tokens = new CommonTokenStream(lexEngine); ParserInterpreter parser = pg.createParserInterpreter(tokens); diff --git a/doc/unicode.md b/doc/unicode.md new file mode 100644 index 0000000000..2fdbf81b1b --- /dev/null +++ b/doc/unicode.md @@ -0,0 +1,68 @@ +# Lexers and Unicode text + +Until ANTLR 4.7, generated lexers only supported part of the Unicode standard +(code points up to `U+FFFF`). 
+ +With ANTLR 4.7 and later, lexers as well as all languages' runtimes +support the full range of Unicode code points up to `U+10FFFF`, as +long as the input `CharStream` is opened using `CharStreams.fromPath()` +or the equivalent method for your runtime's language. + +# Unicode Code Points in Lexer Grammars + +To refer to Unicode [code points](https://en.wikipedia.org/wiki/Code_point) +in lexer grammars, use the `\u` string escape. For example, to create +a lexer rule for a single Cyrillic character by creating a range from +`U+0400` to `U+04FF`: + +```ANTLR +CYRILLIC : ('\u0400'..'\u04FF'); +``` + +Unicode literals larger than U+FFFF must use the extended `\u{12345}` syntax. +For example, to create a lexer rule for a selection of smiley faces +from the [Emoticons Unicode block](http://www.unicode.org/charts/PDF/U1F600.pdf): + +```ANTLR +EMOTICONS : ('\u{1F600}' | '\u{1F602}' | '\u{1F615}'); +``` + +Finally, lexer char sets can include Unicode properties: + +```ANTLR +EMOJI : [\p{Emoji}]; +JAPANESE : [\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}]; +NOT_CYRILLIC : [\P{Script=Cyrillic}]; +``` + +See [lexer-rules.md](lexer-rules.md#lexer-rule-elements) for more detail on Unicode +escapes in lexer rules. + +# CharStreams and UTF-8 + +If your lexer grammar contains code points larger than `U+FFFF`, your +lexer client code must open the file using `CharStreams.fromPath()` or +equivalent in your runtime's language, or input values larger than +`U+FFFF` will *not* match. + +For backwards compatibility, the existing `ANTLRInputStream` and +`ANTLRFileStream` APIs only support Unicode code points up to `U+FFFF`. + +The existing `TestRig` command-line interface supports all Unicode +code points. 
+ +# Example + +If you have generated a lexer named `UnicodeLexer`: + +```Java +public static void main(String[] args) throws IOException { + CharStream charStream = CharStreams.fromPath(Paths.get(args[0])); + Lexer lexer = new UnicodeLexer(charStream); + CommonTokenStream tokens = new CommonTokenStream(lexer); + tokens.fill(); + for (Token token : tokens.getTokens()) { + System.out.println("Got token: " + token.toString()); + } +} +``` diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/BaseJavaTest.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/BaseJavaTest.java index d55a9b01fd..80e564e307 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/BaseJavaTest.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/BaseJavaTest.java @@ -923,7 +923,7 @@ protected void writeTestFile(String parserName, "\n" + "public class Test {\n" + " public static void main(String[] args) throws Exception {\n" + - " CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" + + " CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" + " lex = new (input);\n" + " CommonTokenStream tokens = new CommonTokenStream(lex);\n" + " \n"+ @@ -980,7 +980,7 @@ protected void writeLexerTestFile(String lexerName, boolean showDFA) { "\n" + "public class Test {\n" + " public static void main(String[] args) throws Exception {\n" + - " CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" + + " CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" + " lex = new (input);\n" + " CommonTokenStream tokens = new CommonTokenStream(lex);\n" + " tokens.fill();\n" + diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCharStreams.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCharStreams.java index f944fe5da5..aa106ab1c8 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCharStreams.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCharStreams.java @@ -10,6 
+10,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.Reader; import java.nio.channels.SeekableByteChannel; import java.nio.charset.CharacterCodingException; @@ -20,6 +21,7 @@ import java.util.Arrays; +import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.CharStreams; import org.antlr.v4.runtime.CodePointCharStream; @@ -36,16 +38,16 @@ public class TestCharStreams { public ExpectedException thrown = ExpectedException.none(); @Test - public void createWithBMPStringHasExpectedSize() { - CodePointCharStream s = CharStreams.createWithString("hello"); + public void fromBMPStringHasExpectedSize() { + CharStream s = CharStreams.fromString("hello"); assertEquals(5, s.size()); assertEquals(0, s.index()); assertEquals("hello", s.toString()); } @Test - public void createWithSMPStringHasExpectedSize() { - CodePointCharStream s = CharStreams.createWithString( + public void fromSMPStringHasExpectedSize() { + CharStream s = CharStreams.fromString( "hello \uD83C\uDF0E"); assertEquals(7, s.size()); assertEquals(0, s.index()); @@ -53,10 +55,10 @@ public void createWithSMPStringHasExpectedSize() { } @Test - public void createWithBMPUTF8PathHasExpectedSize() throws Exception { + public void fromBMPUTF8PathHasExpectedSize() throws Exception { Path p = folder.newFile().toPath(); Files.write(p, "hello".getBytes(StandardCharsets.UTF_8)); - CodePointCharStream s = CharStreams.createWithUTF8(p); + CharStream s = CharStreams.fromPath(p); assertEquals(5, s.size()); assertEquals(0, s.index()); assertEquals("hello", s.toString()); @@ -64,10 +66,10 @@ public void createWithBMPUTF8PathHasExpectedSize() throws Exception { } @Test - public void createWithSMPUTF8PathHasExpectedSize() throws Exception { + public void fromSMPUTF8PathHasExpectedSize() throws Exception { Path p = folder.newFile().toPath(); Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8)); - CodePointCharStream s = CharStreams.createWithUTF8(p); + 
CharStream s = CharStreams.fromPath(p); assertEquals(7, s.size()); assertEquals(0, s.index()); assertEquals("hello \uD83C\uDF0E", s.toString()); @@ -75,11 +77,11 @@ public void createWithSMPUTF8PathHasExpectedSize() throws Exception { } @Test - public void createWithBMPUTF8InputStreamHasExpectedSize() throws Exception { + public void fromBMPUTF8InputStreamHasExpectedSize() throws Exception { Path p = folder.newFile().toPath(); Files.write(p, "hello".getBytes(StandardCharsets.UTF_8)); try (InputStream is = Files.newInputStream(p)) { - CodePointCharStream s = CharStreams.createWithUTF8Stream(is); + CharStream s = CharStreams.fromStream(is); assertEquals(5, s.size()); assertEquals(0, s.index()); assertEquals("hello", s.toString()); @@ -87,11 +89,11 @@ public void createWithBMPUTF8InputStreamHasExpectedSize() throws Exception { } @Test - public void createWithSMPUTF8InputStreamHasExpectedSize() throws Exception { + public void fromSMPUTF8InputStreamHasExpectedSize() throws Exception { Path p = folder.newFile().toPath(); Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8)); try (InputStream is = Files.newInputStream(p)) { - CodePointCharStream s = CharStreams.createWithUTF8Stream(is); + CharStream s = CharStreams.fromStream(is); assertEquals(7, s.size()); assertEquals(0, s.index()); assertEquals("hello \uD83C\uDF0E", s.toString()); @@ -99,11 +101,11 @@ public void createWithSMPUTF8InputStreamHasExpectedSize() throws Exception { } @Test - public void createWithBMPUTF8ChannelHasExpectedSize() throws Exception { + public void fromBMPUTF8ChannelHasExpectedSize() throws Exception { Path p = folder.newFile().toPath(); Files.write(p, "hello".getBytes(StandardCharsets.UTF_8)); try (SeekableByteChannel c = Files.newByteChannel(p)) { - CodePointCharStream s = CharStreams.createWithUTF8Channel( + CharStream s = CharStreams.fromChannel( c, 4096, CodingErrorAction.REPLACE, "foo"); assertEquals(5, s.size()); assertEquals(0, s.index()); @@ -113,11 +115,11 @@ public 
void createWithBMPUTF8ChannelHasExpectedSize() throws Exception { } @Test - public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception { + public void fromSMPUTF8ChannelHasExpectedSize() throws Exception { Path p = folder.newFile().toPath(); Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8)); try (SeekableByteChannel c = Files.newByteChannel(p)) { - CodePointCharStream s = CharStreams.createWithUTF8Channel( + CharStream s = CharStreams.fromChannel( c, 4096, CodingErrorAction.REPLACE, "foo"); assertEquals(7, s.size()); assertEquals(0, s.index()); @@ -127,13 +129,13 @@ public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception { } @Test - public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode() + public void fromInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode() throws Exception { Path p = folder.newFile().toPath(); byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE, (byte)0xFE, (byte)0xED }; Files.write(p, toWrite); try (SeekableByteChannel c = Files.newByteChannel(p)) { - CodePointCharStream s = CharStreams.createWithUTF8Channel( + CharStream s = CharStreams.fromChannel( c, 4096, CodingErrorAction.REPLACE, "foo"); assertEquals(3, s.size()); assertEquals(0, s.index()); @@ -142,22 +144,22 @@ public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode( } @Test - public void createWithInvalidUTF8BytesThrowsInReportMode() throws Exception { + public void fromInvalidUTF8BytesThrowsInReportMode() throws Exception { Path p = folder.newFile().toPath(); byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE }; Files.write(p, toWrite); try (SeekableByteChannel c = Files.newByteChannel(p)) { thrown.expect(CharacterCodingException.class); - CharStreams.createWithUTF8Channel(c, 4096, CodingErrorAction.REPORT, "foo"); + CharStreams.fromChannel(c, 4096, CodingErrorAction.REPORT, "foo"); } } @Test - public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception { + 
public void fromSMPUTF8SequenceStraddlingBufferBoundary() throws Exception { Path p = folder.newFile().toPath(); Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8)); try (SeekableByteChannel c = Files.newByteChannel(p)) { - CodePointCharStream s = CharStreams.createWithUTF8Channel( + CharStream s = CharStreams.fromChannel( c, // Note this buffer size ensures the SMP code point // straddles the boundary of two buffers @@ -169,4 +171,40 @@ public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception assertEquals("hello \uD83C\uDF0E", s.toString()); } } + + @Test + public void fromFileName() throws Exception { + Path p = folder.newFile().toPath(); + Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8)); + CharStream s = CharStreams.fromFileName(p.toString()); + assertEquals(7, s.size()); + assertEquals(0, s.index()); + assertEquals("hello \uD83C\uDF0E", s.toString()); + assertEquals(p.toString(), s.getSourceName()); + + } + + @Test + public void fromFileNameWithLatin1() throws Exception { + Path p = folder.newFile().toPath(); + Files.write(p, "hello \u00CA\u00FE".getBytes(StandardCharsets.ISO_8859_1)); + CharStream s = CharStreams.fromFileName(p.toString(), StandardCharsets.ISO_8859_1); + assertEquals(8, s.size()); + assertEquals(0, s.index()); + assertEquals("hello \u00CA\u00FE", s.toString()); + assertEquals(p.toString(), s.getSourceName()); + + } + + @Test + public void fromReader() throws Exception { + Path p = folder.newFile().toPath(); + Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8)); + try (Reader r = Files.newBufferedReader(p, StandardCharsets.UTF_8)) { + CharStream s = CharStreams.fromReader(r); + assertEquals(7, s.size()); + assertEquals(0, s.index()); + assertEquals("hello \uD83C\uDF0E", s.toString()); + } + } } diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java 
b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java index 226e4a6137..5fe1037fed 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java @@ -26,21 +26,21 @@ public class TestCodePointCharStream { @Test public void emptyBytesHasSize0() { - CodePointCharStream s = CharStreams.createWithString(""); + CodePointCharStream s = CharStreams.fromString(""); assertEquals(0, s.size()); assertEquals(0, s.index()); } @Test public void emptyBytesLookAheadReturnsEOF() { - CodePointCharStream s = CharStreams.createWithString(""); + CodePointCharStream s = CharStreams.fromString(""); assertEquals(IntStream.EOF, s.LA(1)); assertEquals(0, s.index()); } @Test public void consumingEmptyStreamShouldThrow() { - CodePointCharStream s = CharStreams.createWithString(""); + CodePointCharStream s = CharStreams.fromString(""); thrown.expect(IllegalStateException.class); thrown.expectMessage("cannot consume EOF"); s.consume(); @@ -48,13 +48,13 @@ public void consumingEmptyStreamShouldThrow() { @Test public void singleLatinCodePointHasSize1() { - CodePointCharStream s = CharStreams.createWithString("X"); + CodePointCharStream s = CharStreams.fromString("X"); assertEquals(1, s.size()); } @Test public void consumingSingleLatinCodePointShouldMoveIndex() { - CodePointCharStream s = CharStreams.createWithString("X"); + CodePointCharStream s = CharStreams.fromString("X"); assertEquals(0, s.index()); s.consume(); assertEquals(1, s.index()); @@ -62,7 +62,7 @@ public void consumingSingleLatinCodePointShouldMoveIndex() { @Test public void consumingPastSingleLatinCodePointShouldThrow() { - CodePointCharStream s = CharStreams.createWithString("X"); + CodePointCharStream s = CharStreams.fromString("X"); s.consume(); thrown.expect(IllegalStateException.class); thrown.expectMessage("cannot consume EOF"); @@ -71,14 +71,14 @@ public void 
consumingPastSingleLatinCodePointShouldThrow() { @Test public void singleLatinCodePointLookAheadShouldReturnCodePoint() { - CodePointCharStream s = CharStreams.createWithString("X"); + CodePointCharStream s = CharStreams.fromString("X"); assertEquals('X', s.LA(1)); assertEquals(0, s.index()); } @Test public void multipleLatinCodePointsLookAheadShouldReturnCodePoints() { - CodePointCharStream s = CharStreams.createWithString("XYZ"); + CodePointCharStream s = CharStreams.fromString("XYZ"); assertEquals('X', s.LA(1)); assertEquals(0, s.index()); assertEquals('Y', s.LA(2)); @@ -89,20 +89,20 @@ public void multipleLatinCodePointsLookAheadShouldReturnCodePoints() { @Test public void singleLatinCodePointLookAheadPastEndShouldReturnEOF() { - CodePointCharStream s = CharStreams.createWithString("X"); + CodePointCharStream s = CharStreams.fromString("X"); assertEquals(IntStream.EOF, s.LA(2)); } @Test public void singleCJKCodePointHasSize1() { - CodePointCharStream s = CharStreams.createWithString("\u611B"); + CodePointCharStream s = CharStreams.fromString("\u611B"); assertEquals(1, s.size()); assertEquals(0, s.index()); } @Test public void consumingSingleCJKCodePointShouldMoveIndex() { - CodePointCharStream s = CharStreams.createWithString("\u611B"); + CodePointCharStream s = CharStreams.fromString("\u611B"); assertEquals(0, s.index()); s.consume(); assertEquals(1, s.index()); @@ -110,7 +110,7 @@ public void consumingSingleCJKCodePointShouldMoveIndex() { @Test public void consumingPastSingleCJKCodePointShouldThrow() { - CodePointCharStream s = CharStreams.createWithString("\u611B"); + CodePointCharStream s = CharStreams.fromString("\u611B"); s.consume(); thrown.expect(IllegalStateException.class); thrown.expectMessage("cannot consume EOF"); @@ -119,21 +119,21 @@ public void consumingPastSingleCJKCodePointShouldThrow() { @Test public void singleCJKCodePointLookAheadShouldReturnCodePoint() { - CodePointCharStream s = CharStreams.createWithString("\u611B"); + 
CodePointCharStream s = CharStreams.fromString("\u611B"); assertEquals(0x611B, s.LA(1)); assertEquals(0, s.index()); } @Test public void singleCJKCodePointLookAheadPastEndShouldReturnEOF() { - CodePointCharStream s = CharStreams.createWithString("\u611B"); + CodePointCharStream s = CharStreams.fromString("\u611B"); assertEquals(IntStream.EOF, s.LA(2)); assertEquals(0, s.index()); } @Test public void singleEmojiCodePointHasSize1() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder().appendCodePoint(0x1F4A9).toString()); assertEquals(1, s.size()); assertEquals(0, s.index()); @@ -141,7 +141,7 @@ public void singleEmojiCodePointHasSize1() { @Test public void consumingSingleEmojiCodePointShouldMoveIndex() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder().appendCodePoint(0x1F4A9).toString()); assertEquals(0, s.index()); s.consume(); @@ -150,7 +150,7 @@ public void consumingSingleEmojiCodePointShouldMoveIndex() { @Test public void consumingPastEndOfEmojiCodePointWithShouldThrow() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder().appendCodePoint(0x1F4A9).toString()); assertEquals(0, s.index()); s.consume(); @@ -162,7 +162,7 @@ public void consumingPastEndOfEmojiCodePointWithShouldThrow() { @Test public void singleEmojiCodePointLookAheadShouldReturnCodePoint() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder().appendCodePoint(0x1F4A9).toString()); assertEquals(0x1F4A9, s.LA(1)); assertEquals(0, s.index()); @@ -170,7 +170,7 @@ public void singleEmojiCodePointLookAheadShouldReturnCodePoint() { @Test public void singleEmojiCodePointLookAheadPastEndShouldReturnEOF() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new 
StringBuilder().appendCodePoint(0x1F4A9).toString()); assertEquals(IntStream.EOF, s.LA(2)); assertEquals(0, s.index()); @@ -178,19 +178,19 @@ public void singleEmojiCodePointLookAheadPastEndShouldReturnEOF() { @Test public void getTextWithLatin() { - CodePointCharStream s = CharStreams.createWithString("0123456789"); + CodePointCharStream s = CharStreams.fromString("0123456789"); assertEquals("34567", s.getText(Interval.of(3, 7))); } @Test public void getTextWithCJK() { - CodePointCharStream s = CharStreams.createWithString("01234\u40946789"); + CodePointCharStream s = CharStreams.fromString("01234\u40946789"); assertEquals("34\u409467", s.getText(Interval.of(3, 7))); } @Test public void getTextWithEmoji() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder("01234") .appendCodePoint(0x1F522) .append("6789") @@ -200,19 +200,19 @@ public void getTextWithEmoji() { @Test public void toStringWithLatin() { - CodePointCharStream s = CharStreams.createWithString("0123456789"); + CodePointCharStream s = CharStreams.fromString("0123456789"); assertEquals("0123456789", s.toString()); } @Test public void toStringWithCJK() { - CodePointCharStream s = CharStreams.createWithString("01234\u40946789"); + CodePointCharStream s = CharStreams.fromString("01234\u40946789"); assertEquals("01234\u40946789", s.toString()); } @Test public void toStringWithEmoji() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder("01234") .appendCodePoint(0x1F522) .append("6789") @@ -222,19 +222,19 @@ public void toStringWithEmoji() { @Test public void lookAheadWithLatin() { - CodePointCharStream s = CharStreams.createWithString("0123456789"); + CodePointCharStream s = CharStreams.fromString("0123456789"); assertEquals('5', s.LA(6)); } @Test public void lookAheadWithCJK() { - CodePointCharStream s = CharStreams.createWithString("01234\u40946789"); + 
CodePointCharStream s = CharStreams.fromString("01234\u40946789"); assertEquals(0x4094, s.LA(6)); } @Test public void lookAheadWithEmoji() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder("01234") .appendCodePoint(0x1F522) .append("6789") @@ -244,21 +244,21 @@ public void lookAheadWithEmoji() { @Test public void seekWithLatin() { - CodePointCharStream s = CharStreams.createWithString("0123456789"); + CodePointCharStream s = CharStreams.fromString("0123456789"); s.seek(5); assertEquals('5', s.LA(1)); } @Test public void seekWithCJK() { - CodePointCharStream s = CharStreams.createWithString("01234\u40946789"); + CodePointCharStream s = CharStreams.fromString("01234\u40946789"); s.seek(5); assertEquals(0x4094, s.LA(1)); } @Test public void seekWithEmoji() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder("01234") .appendCodePoint(0x1F522) .append("6789") @@ -269,21 +269,21 @@ public void seekWithEmoji() { @Test public void lookBehindWithLatin() { - CodePointCharStream s = CharStreams.createWithString("0123456789"); + CodePointCharStream s = CharStreams.fromString("0123456789"); s.seek(6); assertEquals('5', s.LA(-1)); } @Test public void lookBehindWithCJK() { - CodePointCharStream s = CharStreams.createWithString("01234\u40946789"); + CodePointCharStream s = CharStreams.fromString("01234\u40946789"); s.seek(6); assertEquals(0x4094, s.LA(-1)); } @Test public void lookBehindWithEmoji() { - CodePointCharStream s = CharStreams.createWithString( + CodePointCharStream s = CharStreams.fromString( new StringBuilder("01234") .appendCodePoint(0x1F522) .append("6789") diff --git a/runtime/Java/src/org/antlr/v4/runtime/CharStreams.java b/runtime/Java/src/org/antlr/v4/runtime/CharStreams.java index 566452a8db..222eab621f 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/CharStreams.java +++ 
b/runtime/Java/src/org/antlr/v4/runtime/CharStreams.java @@ -7,19 +7,27 @@ import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; import java.nio.ByteBuffer; import java.nio.IntBuffer; +import java.nio.charset.Charset; import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; import java.nio.channels.Channels; import java.nio.channels.FileChannel; import java.nio.channels.ReadableByteChannel; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; /** - * Utility class to create {@link CodePointCharStream}s from - * various sources of Unicode data. + * Utility class to create {@link CharStream}s from various sources of + * string data. + * + * Main entry points are the factory methods {@code CharStreams.fromPath()}, + * {@code CharStreams.fromString()}, etc. */ public final class CharStreams { private static final int DEFAULT_BUFFER_SIZE = 4096; @@ -28,14 +36,215 @@ public final class CharStreams { private CharStreams() { } /** - * Convenience method to create a {@link CodePointCharStream} - * for the Unicode code points in a Java {@link String}. + * Creates a {@link CharStream} given a path to a UTF-8 + * encoded file on disk. + * + * Reads the entire contents of the file into the result before returning. */ - public static CodePointCharStream createWithString(String s) { - return createWithString(s, IntStream.UNKNOWN_SOURCE_NAME); + public static CharStream fromPath(Path path) throws IOException { + return fromPath(path, StandardCharsets.UTF_8); } - public static CodePointCharStream createWithString(String s, String sourceName) { + /** + * Creates a {@link CharStream} given a path to a file on disk and the + * charset of the bytes contained in the file. + * + * Reads the entire contents of the file into the result before returning. + * + * For sources encoded in UTF-8, supports the full Unicode code point + * range. 
+ * + * For other sources, only supports Unicode code points up to U+FFFF. + */ + public static CharStream fromPath(Path path, Charset charset) throws IOException { + if (charset.equals(StandardCharsets.UTF_8)) { + try (ReadableByteChannel channel = Files.newByteChannel(path)) { + return fromChannel( + channel, + DEFAULT_BUFFER_SIZE, + CodingErrorAction.REPLACE, + path.toString()); + } + } else { + return new ANTLRFileStream(path.toString(), charset.toString()); + } + } + + /** + * Creates a {@link CharStream} given a string containing a + * path to a UTF-8 file on disk. + * + * Reads the entire contents of the file into the result before returning. + */ + public static CharStream fromFileName(String fileName) throws IOException { + return fromPath(Paths.get(fileName), StandardCharsets.UTF_8); + } + + /** + * Creates a {@link CharStream} given a string containing a + * path to a file on disk and the charset of the bytes + * contained in the file. + * + * Reads the entire contents of the file into the result before returning. + * + * For sources encoded in UTF-8, supports the full Unicode code point + * range. + * + * For other sources, only supports Unicode code points up to U+FFFF. + */ + public static CharStream fromFileName(String fileName, Charset charset) throws IOException { + return fromPath(Paths.get(fileName), charset); + } + + + /** + * Creates a {@link CharStream} given an opened {@link InputStream} + * containing UTF-8 bytes. + * + * Reads the entire contents of the {@code InputStream} into + * the result before returning, then closes the {@code InputStream}. + */ + public static CharStream fromStream(InputStream is) throws IOException { + return fromStream(is, StandardCharsets.UTF_8); + } + +/** + * Creates a {@link CharStream} given an opened {@link InputStream} and the + * charset of the bytes contained in the stream. 
+ * + * Reads the entire contents of the {@code InputStream} into + * the result before returning, then closes the {@code InputStream}. + * + * For sources encoded in UTF-8, supports the full Unicode code point + * range. + * + * For other sources, only supports Unicode code points up to U+FFFF. + */ + public static CharStream fromStream(InputStream is, Charset charset) throws IOException { + if (charset.equals(StandardCharsets.UTF_8)) { + try (ReadableByteChannel channel = Channels.newChannel(is)) { + return fromChannel( + channel, + DEFAULT_BUFFER_SIZE, + CodingErrorAction.REPLACE, + IntStream.UNKNOWN_SOURCE_NAME); + } + } else { + try (InputStreamReader isr = new InputStreamReader(is, charset)) { + return new ANTLRInputStream(isr); + } + } + } + + /** + * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} + * containing UTF-8 bytes. + * + * Reads the entire contents of the {@code channel} into + * the result before returning, then closes the {@code channel}. + */ + public static CharStream fromChannel(ReadableByteChannel channel) throws IOException { + return fromChannel(channel, StandardCharsets.UTF_8); + } + + /** + * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} and the + * charset of the bytes contained in the channel. + * + * Reads the entire contents of the {@code channel} into + * the result before returning, then closes the {@code channel}. + * + * For sources encoded in UTF-8, supports the full Unicode code point + * range. + * + * For other sources, only supports Unicode code points up to U+FFFF. 
+ */ + public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException { + if (charset.equals(StandardCharsets.UTF_8)) { + return fromChannel( + channel, + DEFAULT_BUFFER_SIZE, + CodingErrorAction.REPLACE, + IntStream.UNKNOWN_SOURCE_NAME); + } else { + try (InputStream is = Channels.newInputStream(channel); + InputStreamReader isr = new InputStreamReader(is, charset)) { + return new ANTLRInputStream(isr); + } + } + } + + /** + * Creates a {@link CharStream} given a {@link Reader}. Closes + * the reader before returning. + */ + public static CodePointCharStream fromReader(Reader r) throws IOException { + return fromReader(r, IntStream.UNKNOWN_SOURCE_NAME); + } + + /** + * Creates a {@link CharStream} given a {@link Reader} and its + * source name. Closes the reader before returning. + */ + public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException { + IntBuffer codePointBuffer = IntBuffer.allocate(DEFAULT_BUFFER_SIZE); + int highSurrogate = -1; + int curCodeUnit; + try { + while ((curCodeUnit = r.read()) != -1) { + if (!codePointBuffer.hasRemaining()) { + // Buffer is full: double the code point buffer capacity. + IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2); + codePointBuffer.flip(); + newBuffer.put(codePointBuffer); + codePointBuffer = newBuffer; + } + if (Character.isHighSurrogate((char) curCodeUnit)) { + if (highSurrogate != -1) { + // Dangling high surrogate followed by another high surrogate. + codePointBuffer.put(highSurrogate); + } + highSurrogate = curCodeUnit; + } else if (Character.isLowSurrogate((char) curCodeUnit)) { + if (highSurrogate == -1) { + // Low surrogate not preceded by high surrogate. 
+ codePointBuffer.put(curCodeUnit); + } else { + codePointBuffer.put(Character.toCodePoint((char) highSurrogate, (char) curCodeUnit)); + highSurrogate = -1; + } + } else { + if (highSurrogate != -1) { + // Dangling high surrogate followed by a non-surrogate. + codePointBuffer.put(highSurrogate); + highSurrogate = -1; + } + codePointBuffer.put(curCodeUnit); + } + } + if (highSurrogate != -1) { + // Dangling high surrogate at end of file. + codePointBuffer.put(highSurrogate); + } + codePointBuffer.flip(); + return new CodePointCharStream(codePointBuffer, sourceName); + } finally { + r.close(); + } + } + + /** + * Creates a {@link CharStream} given a {@link String}. + */ + public static CodePointCharStream fromString(String s) { + return fromString(s, IntStream.UNKNOWN_SOURCE_NAME); + } + + /** + * Creates a {@link CharStream} given a {@link String} and the {@code sourceName} + * from which it came. + */ + public static CodePointCharStream fromString(String s, String sourceName) { // Initial guess assumes no code points > U+FFFF: one code // point for each code unit in the string IntBuffer codePointBuffer = IntBuffer.allocate(s.length()); @@ -56,48 +265,39 @@ public static CodePointCharStream createWithString(String s, String sourceName) return new CodePointCharStream(codePointBuffer, sourceName); } - public static CodePointCharStream createWithUTF8(Path path) throws IOException { - try (ReadableByteChannel channel = Files.newByteChannel(path)) { - return createWithUTF8Channel( - channel, - DEFAULT_BUFFER_SIZE, - CodingErrorAction.REPLACE, - path.toString()); - } - } - - public static CodePointCharStream createWithUTF8Stream(InputStream is) throws IOException { - try (ReadableByteChannel channel = Channels.newChannel(is)) { - return createWithUTF8Channel( - channel, - DEFAULT_BUFFER_SIZE, - CodingErrorAction.REPLACE, - IntStream.UNKNOWN_SOURCE_NAME); - } - } - - public static CodePointCharStream createWithUTF8Channel( + /** + * Creates a {@link CharStream} given an 
opened {@link ReadableByteChannel} + * containing UTF-8 bytes. + * + * Reads the entire contents of the {@code channel} into + * the result before returning, then closes the {@code channel}. + */ + public static CodePointCharStream fromChannel( ReadableByteChannel channel, int bufferSize, CodingErrorAction decodingErrorAction, String sourceName ) throws IOException { - ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize); - IntBuffer codePointsOut = IntBuffer.allocate(bufferSize); - boolean endOfInput = false; - UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction); - while (!endOfInput) { - int bytesRead = channel.read(utf8BytesIn); - endOfInput = (bytesRead == -1); - utf8BytesIn.flip(); - codePointsOut = decoder.decodeCodePointsFromBuffer( - utf8BytesIn, - codePointsOut, - endOfInput); - utf8BytesIn.compact(); + try { + ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize); + IntBuffer codePointsOut = IntBuffer.allocate(bufferSize); + boolean endOfInput = false; + UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction); + while (!endOfInput) { + int bytesRead = channel.read(utf8BytesIn); + endOfInput = (bytesRead == -1); + utf8BytesIn.flip(); + codePointsOut = decoder.decodeCodePointsFromBuffer( + utf8BytesIn, + codePointsOut, + endOfInput); + utf8BytesIn.compact(); + } + codePointsOut.limit(codePointsOut.position()); + codePointsOut.flip(); + return new CodePointCharStream(codePointsOut, sourceName); + } finally { + channel.close(); } - codePointsOut.limit(codePointsOut.position()); - codePointsOut.flip(); - return new CodePointCharStream(codePointsOut, sourceName); } } diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java index 966416470b..8aa8e9e988 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java +++ 
b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java @@ -381,7 +381,7 @@ public void testSetUp() throws Exception { protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) { ATN atn = createATN(lg, true); - CharStream input = CharStreams.createWithString(inputString); + CharStream input = CharStreams.fromString(inputString); ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE"); DOTGenerator dot = new DOTGenerator(lg); // System.out.println(dot.getDOT(startState, true)); diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java index 640e8ced6e..291f146cc6 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java @@ -161,7 +161,7 @@ private static String parseTreeForGrammarWithInput( String inputText) throws Exception { Grammar grammar = new Grammar(grammarText); LexerInterpreter lexEngine = grammar.createLexerInterpreter( - CharStreams.createWithString(inputText)); + CharStreams.fromString(inputText)); CommonTokenStream tokens = new CommonTokenStream(lexEngine); GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens); ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index); diff --git a/tool/src/org/antlr/v4/gui/TestRig.java b/tool/src/org/antlr/v4/gui/TestRig.java index 7630af98f0..074ab301e0 100644 --- a/tool/src/org/antlr/v4/gui/TestRig.java +++ b/tool/src/org/antlr/v4/gui/TestRig.java @@ -26,7 +26,6 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; @@ -157,28 +156,12 @@ public void process() throws Exception { Charset charset = ( encoding == null ? 
Charset.defaultCharset () : Charset.forName(encoding) ); if ( inputFiles.size()==0 ) { - CharStream charStream; - if ( charset.equals(StandardCharsets.UTF_8)) { - charStream = CharStreams.createWithUTF8Stream(System.in); - } - else { - try ( InputStreamReader r = new InputStreamReader(System.in, charset) ) { - charStream = new ANTLRInputStream(r); - } - } + CharStream charStream = CharStreams.fromStream(System.in, charset); process(lexer, parserClass, parser, charStream); return; } for (String inputFile : inputFiles) { - CharStream charStream; - if ( charset.equals(StandardCharsets.UTF_8) ) { - charStream = CharStreams.createWithUTF8(Paths.get(inputFile)); - } - else { - try ( InputStreamReader r = new InputStreamReader(System.in, charset) ) { - charStream = new ANTLRInputStream(r); - } - } + CharStream charStream = CharStreams.fromPath(Paths.get(inputFile), charset); if ( inputFiles.size()>1 ) { System.err.println(inputFile); }