Tidy up CharStreams API. Add new doc/unicode.md

antlr · Mar 16, 2017 · 4f21686 · 4f21686
1 parent b467dc8
commit 4f21686
Show file tree

Hide file tree

Showing 10 changed files with 415 additions and 126 deletions.
diff --git a/doc/faq/general.md b/doc/faq/general.md
@@ -82,7 +82,7 @@ Make sure to use two-stage parsing. See example in [bug report](https://github.c
 
 ```Java
 
-CharStream input = new ANTLRFileStream(args[0]);
+CharStream input = CharStreams.fromPath(Paths.get(args[0]));
 ExprLexer lexer = new ExprLexer(input);
 CommonTokenStream tokens = new CommonTokenStream(lexer);
 ExprParser parser = new ExprParser(tokens);

diff --git a/doc/interpreters.md b/doc/interpreters.md
@@ -30,7 +30,7 @@ public static ParseTree parse(String fileName,
     throws IOException
 {
     final Grammar g = Grammar.load(combinedGrammarFileName);
-    LexerInterpreter lexEngine = g.createLexerInterpreter(new ANTLRFileStream(fileName));
+    LexerInterpreter lexEngine = g.createLexerInterpreter(CharStreams.fromPath(Paths.get(fileName)));
     CommonTokenStream tokens = new CommonTokenStream(lexEngine);
     ParserInterpreter parser = g.createParserInterpreter(tokens);
     ParseTree t = parser.parse(g.getRule(startRule).index);
@@ -58,7 +58,7 @@ public static ParseTree parse(String fileNameToParse,
 {
     final LexerGrammar lg = (LexerGrammar) Grammar.load(lexerGrammarFileName);
     final Grammar pg = Grammar.load(parserGrammarFileName, lg);
-    ANTLRFileStream input = new ANTLRFileStream(fileNameToParse);
+    CharStream input = CharStreams.fromPath(Paths.get(fileNameToParse));
     LexerInterpreter lexEngine = lg.createLexerInterpreter(input);
     CommonTokenStream tokens = new CommonTokenStream(lexEngine);
     ParserInterpreter parser = pg.createParserInterpreter(tokens);

diff --git a/doc/unicode.md b/doc/unicode.md
@@ -0,0 +1,68 @@
+# Lexers and Unicode text
+
+Before ANTLR 4.7, generated lexers only supported part of the Unicode standard
+(code points up to `U+FFFF`).
+
+With ANTLR 4.7 and later, lexers as well as all languages' runtimes
+support the full range of Unicode code points up to `U+10FFFF`, as
+long as the input `CharStream` is opened using `CharStreams.fromPath()`
+or the equivalent method for your runtime's language.
+
+# Unicode Code Points in Lexer Grammars
+
+To refer to Unicode [code points](https://en.wikipedia.org/wiki/Code_point)
+in lexer grammars, use the `\u` string escape. For example, to create
+a lexer rule that matches a single Cyrillic character in the range
+`U+0400` to `U+04FF`:
+
+```ANTLR
+CYRILLIC : ('\u0400'..'\u04FF');
+```
+
+Unicode literals larger than U+FFFF must use the extended `\u{12345}` syntax.
+For example, to create a lexer rule for a selection of smiley faces
+from the [Emoticons Unicode block](http://www.unicode.org/charts/PDF/U1F600.pdf):
+
+```ANTLR
+EMOTICONS : ('\u{1F600}' | '\u{1F602}' | '\u{1F615}');
+```
+
+Finally, lexer char sets can include Unicode properties:
+
+```ANTLR
+EMOJI : [\p{Emoji}];
+JAPANESE : [\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}];
+NOT_CYRILLIC : [\P{Script=Cyrillic}];
+```
+
+See [lexer-rules.md](lexer-rules.md#lexer-rule-elements) for more detail on Unicode
+escapes in lexer rules.
+
+# CharStreams and UTF-8
+
+If your lexer grammar contains code points larger than `U+FFFF`, your
+lexer client code must open the file using `CharStreams.fromPath()` or
+equivalent in your runtime's language, or input values larger than
+`U+FFFF` will *not* match.
+
+For backwards compatibility, the existing `ANTLRInputStream` and
+`ANTLRFileStream` APIs only support Unicode code points up to `U+FFFF`.
+
+The existing `TestRig` command-line interface supports all Unicode
+code points.
+
+# Example
+
+If you have generated a lexer named `UnicodeLexer`:
+
+```Java
+public static void main(String[] args) throws IOException {
+  CharStream charStream = CharStreams.fromPath(Paths.get(args[0]));
+  Lexer lexer = new UnicodeLexer(charStream);
+  CommonTokenStream tokens = new CommonTokenStream(lexer);
+  tokens.fill();
+  for (Token token : tokens.getTokens()) {
+    System.out.println("Got token: " + token.toString());
+  }
+}
+```
diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/BaseJavaTest.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/BaseJavaTest.java
@@ -923,7 +923,7 @@ protected void writeTestFile(String parserName,
 			"\n" +
 			"public class Test {\n" +
 			"    public static void main(String[] args) throws Exception {\n" +
-			"        CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
+			"        CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
 			"        <lexerName> lex = new <lexerName>(input);\n" +
 			"        CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
 			"        <createParser>\n"+
@@ -980,7 +980,7 @@ protected void writeLexerTestFile(String lexerName, boolean showDFA) {
 			"\n" +
 			"public class Test {\n" +
 			"    public static void main(String[] args) throws Exception {\n" +
-			"        CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
+			"        CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
 			"        <lexerName> lex = new <lexerName>(input);\n" +
 			"        CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
 			"        tokens.fill();\n" +

diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCharStreams.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCharStreams.java
@@ -10,6 +10,7 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.Reader;
 
 import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.CharacterCodingException;
@@ -20,6 +21,7 @@
 
 import java.util.Arrays;
 
+import org.antlr.v4.runtime.CharStream;
 import org.antlr.v4.runtime.CharStreams;
 import org.antlr.v4.runtime.CodePointCharStream;
 
@@ -36,74 +38,74 @@ public class TestCharStreams {
 	public ExpectedException thrown = ExpectedException.none();
 
 	@Test
-	public void createWithBMPStringHasExpectedSize() {
-		CodePointCharStream s = CharStreams.createWithString("hello");
+	public void fromBMPStringHasExpectedSize() {
+		CharStream s = CharStreams.fromString("hello");
 		assertEquals(5, s.size());
 		assertEquals(0, s.index());
 		assertEquals("hello", s.toString());
 	}
 
 	@Test
-	public void createWithSMPStringHasExpectedSize() {
-		CodePointCharStream s = CharStreams.createWithString(
+	public void fromSMPStringHasExpectedSize() {
+		CharStream s = CharStreams.fromString(
 				"hello \uD83C\uDF0E");
 		assertEquals(7, s.size());
 		assertEquals(0, s.index());
 		assertEquals("hello \uD83C\uDF0E", s.toString());
 	}
 
 	@Test
-	public void createWithBMPUTF8PathHasExpectedSize() throws Exception {
+	public void fromBMPUTF8PathHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
-		CodePointCharStream s = CharStreams.createWithUTF8(p);
+		CharStream s = CharStreams.fromPath(p);
 		assertEquals(5, s.size());
 		assertEquals(0, s.index());
 		assertEquals("hello", s.toString());
 		assertEquals(p.toString(), s.getSourceName());
 	}
 
 	@Test
-	public void createWithSMPUTF8PathHasExpectedSize() throws Exception {
+	public void fromSMPUTF8PathHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
-		CodePointCharStream s = CharStreams.createWithUTF8(p);
+		CharStream s = CharStreams.fromPath(p);
 		assertEquals(7, s.size());
 		assertEquals(0, s.index());
 		assertEquals("hello \uD83C\uDF0E", s.toString());
 		assertEquals(p.toString(), s.getSourceName());
 	}
 
 	@Test
-	public void createWithBMPUTF8InputStreamHasExpectedSize() throws Exception {
+	public void fromBMPUTF8InputStreamHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
 		try (InputStream is = Files.newInputStream(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
+			CharStream s = CharStreams.fromStream(is);
 			assertEquals(5, s.size());
 			assertEquals(0, s.index());
 			assertEquals("hello", s.toString());
 		}
 	}
 
 	@Test
-	public void createWithSMPUTF8InputStreamHasExpectedSize() throws Exception {
+	public void fromSMPUTF8InputStreamHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
 		try (InputStream is = Files.newInputStream(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
+			CharStream s = CharStreams.fromStream(is);
 			assertEquals(7, s.size());
 			assertEquals(0, s.index());
 			assertEquals("hello \uD83C\uDF0E", s.toString());
 		}
 	}
 
 	@Test
-	public void createWithBMPUTF8ChannelHasExpectedSize() throws Exception {
+	public void fromBMPUTF8ChannelHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Channel(
+			CharStream s = CharStreams.fromChannel(
 					c, 4096, CodingErrorAction.REPLACE, "foo");
 			assertEquals(5, s.size());
 			assertEquals(0, s.index());
@@ -113,11 +115,11 @@ public void createWithBMPUTF8ChannelHasExpectedSize() throws Exception {
 	}
 
 	@Test
-	public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception {
+	public void fromSMPUTF8ChannelHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Channel(
+			CharStream s = CharStreams.fromChannel(
 					c, 4096, CodingErrorAction.REPLACE, "foo");
 			assertEquals(7, s.size());
 			assertEquals(0, s.index());
@@ -127,13 +129,13 @@ public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception {
 	}
 
 	@Test
-	public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
+	public void fromInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
 		throws Exception {
 		Path p = folder.newFile().toPath();
 		byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE, (byte)0xFE, (byte)0xED };
 		Files.write(p, toWrite);
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Channel(
+			CharStream s = CharStreams.fromChannel(
 					c, 4096, CodingErrorAction.REPLACE, "foo");
 			assertEquals(3, s.size());
 			assertEquals(0, s.index());
@@ -142,22 +144,22 @@ public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode(
 	}
 
 	@Test
-	public void createWithInvalidUTF8BytesThrowsInReportMode() throws Exception {
+	public void fromInvalidUTF8BytesThrowsInReportMode() throws Exception {
 		Path p = folder.newFile().toPath();
 		byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE };
 		Files.write(p, toWrite);
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
 			thrown.expect(CharacterCodingException.class);
-			CharStreams.createWithUTF8Channel(c, 4096, CodingErrorAction.REPORT, "foo");
+			CharStreams.fromChannel(c, 4096, CodingErrorAction.REPORT, "foo");
 		}
 	}
 
 	@Test
-	public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
+	public void fromSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Channel(
+			CharStream s = CharStreams.fromChannel(
 					c,
 					// Note this buffer size ensures the SMP code point
 					// straddles the boundary of two buffers
@@ -169,4 +171,40 @@ public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception
 			assertEquals("hello \uD83C\uDF0E", s.toString());
 		}
 	}
+
+	@Test
+	public void fromFileName() throws Exception {
+		Path p = folder.newFile().toPath();
+		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
+		CharStream s = CharStreams.fromFileName(p.toString());
+		assertEquals(7, s.size());
+		assertEquals(0, s.index());
+		assertEquals("hello \uD83C\uDF0E", s.toString());
+		assertEquals(p.toString(), s.getSourceName());
+
+	}
+
+	@Test
+	public void fromFileNameWithLatin1() throws Exception {
+		Path p = folder.newFile().toPath();
+		Files.write(p, "hello \u00CA\u00FE".getBytes(StandardCharsets.ISO_8859_1));
+		CharStream s = CharStreams.fromFileName(p.toString(), StandardCharsets.ISO_8859_1);
+		assertEquals(8, s.size());
+		assertEquals(0, s.index());
+		assertEquals("hello \u00CA\u00FE", s.toString());
+		assertEquals(p.toString(), s.getSourceName());
+
+	}
+
+	@Test
+	public void fromReader() throws Exception {
+		Path p = folder.newFile().toPath();
+		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
+		try (Reader r = Files.newBufferedReader(p, StandardCharsets.UTF_8)) {
+			CharStream s = CharStreams.fromReader(r);
+			assertEquals(7, s.size());
+			assertEquals(0, s.index());
+			assertEquals("hello \uD83C\uDF0E", s.toString());
+		}
+	}
 }