forked from rust-lang/rust
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add implementation of UAX#29 word bounds algorithm
This patch does the following: 1. Adds three new structs in libunicode/str.rs: a. UnicodeWords: a filter on the UWordBounds iterator that yields only the "words" of a string as defined in Section 4 of Unicode Standard Annex #29 (UAX#29), http://unicode.org/reports/tr29/#Word_Boundaries b. UWordBounds: an iterator that segments a string on its word boundaries as defined in UAX#29. Note that this *only* segments the string, and does *not* drop whitespace and other non-word pieces of the text (that's what UnicodeWords does). Note that UWordBounds has both a forward and backward iterator that have total running time (that is, to segment the entire string) linear in the size of the string. It should be noted that with pathological inputs the reverse iterator could be about 2x less efficient than the forward iterator, but on reasonable inputs their costs are similar. c. UWordBoundIndices: the above iterator, but returning tuples of (offset, &str). 2. Adds three new functions in the `UnicodeStr` trait: a. words_unicode(): returns a UnicodeWords iterator. b. split_words_uax29(): returns a UWordBounds iterator. c. split_words_uax29_indices(): returns a UWordBoundIndices iterator. 3. Updates the `src/etc/unicode.py` script to generate tables necessary for running the UWordBounds iterators. 4. Adds a new script, `src/etc/unicode_gen_breaktests.py`, which processes the grapheme and word break tests published by the Unicode consortium into a format for inclusion in libcollectionstest. 5. Adds new impls in libcollections's `str` corresponding to the `UnicodeStr` functions of (2). Note that this new functionality is gated with `feature(unicode)`. 6. Adds tests in libcollectionstest to exercise this new functionality. In addition, updates the test data for the graphemes test to correspond to the output from the script of (4). (Note that at the moment this change is primarily cosmetic.)
This patch does not settle the question raised by @huonw in rust-lang#15628; rather, it introduces a new function alongside `words()` that follows UAX#29. In addition, it does not address the concerns that @SimonSapin raises in rust-lang/rfcs#1054 since it leaves `words()` alone.
- Loading branch information
Showing
7 changed files
with
2,596 additions
and
374 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 | ||
# | ||
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT | ||
# file at the top-level directory of this distribution and at | ||
# http://rust-lang.org/COPYRIGHT. | ||
# | ||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
# option. This file may not be copied, modified, or distributed | ||
# except according to those terms. | ||
|
||
# This script uses the following Unicode tables: | ||
# - GraphemeBreakTest.txt | ||
# - WordBreakTest.txt | ||
# | ||
# Since this should not require frequent updates, we just store the | ||
# generated test data (graph_tests.rs, word_tests.rs) out-of-line. | ||
|
||
import unicode, re, os, fileinput | ||
|
||
def load_test_data(f, optsplit=None):
    """Load the break test cases contained in one Unicode test data file.

    Calls unicode.fetch(f), then reads the file named os.path.basename(f)
    and parses every line that starts with the ÷ character.

    f        -- path of the test data file, as given to unicode.fetch()
    optsplit -- optional collection of rule numbers (strings such as
                '9.1'); it is forwarded to process_split_info(), which
                treats a '×' annotated with one of these rules as a break

    Returns a list of (chars, info) tuples as produced by
    process_split_info(): `chars` is a list of lists of code points and
    `info` holds the rule number recorded for each break between them.

    Test lines for which process_split_string() returns an empty list
    (surrogates) are skipped. Lines that start with ÷ but do not match the
    expected layout are reported on stdout and skipped.
    """
    # a None default avoids sharing one mutable list between calls
    if optsplit is None:
        optsplit = []

    outls = []
    # group 1: the code points and ÷/× markers found before the '#'
    # group 2: the annotated rules that follow the leading '÷ [0.2]' item
    testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)
    try:
        for line in fileinput.input(os.path.basename(f)):
            # lines that include a test start with the ÷ character;
            # startswith() does not depend on how wide that character is
            if not line.startswith('÷'):
                continue

            m = testRe1.match(line)
            if not m:
                print("error: no match on line where test was expected: %s" % line)
                continue

            # process the characters in this test case
            chars = process_split_string(m.group(1))
            # skip test case if it contains invalid characters (viz., surrogates)
            if not chars:
                continue

            # now process test cases
            (chars, info) = process_split_info(m.group(2), chars, optsplit)

            # make sure that we have break info for each break!
            assert len(chars) - 1 == len(info)

            outls.append((chars, info))
    finally:
        # release the module-level fileinput state even when a line fails,
        # so that a later fileinput.input() call is not rejected
        fileinput.close()

    return outls
|
||
def process_split_info(s, c, o):
    """Merge per-character groups according to the break annotations in `s`.

    s -- annotation text of one test line: a sequence of
         '(÷|×) [x.y] ...' items, one for each boundary between
         consecutive entries of `c`
    c -- list of lists of code points; it is not modified
    o -- collection of rule numbers (strings); a '×' item whose rule
         number is in `o` is treated as a break

    Returns (groups, rules): `groups` is the list of merged code point
    lists and `rules` holds the rule number of every break, so that
    len(groups) - 1 == len(rules).

    Raises IndexError when `c` has too few entries for the items in `s`,
    and ValueError when an item lacks its '[' or ']'.
    """
    # work on copies so that the caller's lists are left untouched
    pending = [list(group) for group in c]
    pos = 0

    outcs = []
    outis = []
    workcs = pending[pos]
    pos += 1

    # are we on a × or a ÷?
    # startswith() keeps this test independent of how many bytes or
    # characters the marker occupies in `s`
    isX = s.startswith('×')

    # find each instance of '(÷|×) [x.y] '
    while s:
        # find the currently considered rule number
        sInd = s.index('[') + 1
        eInd = s.index(']')
        rule = s[sInd:eInd]

        # if it's '× [a.b]' where 'a.b' is in o, then
        # we consider it a split even though it's not
        # marked as one
        # if it's ÷ then it's always a split
        if not isX or rule in o:
            outis.append(rule)
            outcs.append(workcs)
            workcs = pending[pos]
        else:
            workcs.extend(pending[pos])
        pos += 1

        # advance to the next marker, or to the end of the text
        idx = 1
        while idx < len(s):
            if s.startswith('×', idx):
                isX = True
                break
            if s.startswith('÷', idx):
                isX = False
                break
            idx += 1
        s = s[idx:]

    outcs.append(workcs)
    return (outcs, outis)
|
||
def process_split_string(s):
    """Split a test string into lists of code points.

    `s` is a whitespace-separated sequence of hexadecimal code points and
    ÷/× markers. Every marker closes the list collected so far (even an
    empty one) and starts a new list; a non-empty list left over at the
    end is kept as well.

    Returns the list of code point lists, or an empty list as soon as
    unicode.is_surrogate() reports one of the code points as a surrogate.
    Raises ValueError for a token that is neither a marker nor a
    hexadecimal number.
    """
    groups = []
    current = []

    for token in s.split():
        if token in ('÷', '×'):
            # a marker ends the current group
            groups.append(current)
            current = []
        else:
            codepoint = int(token, 16)
            if unicode.is_surrogate(codepoint):
                return []
            current.append(codepoint)

    if current:
        groups.append(current)

    return groups
|
||
def showfun(x):
    """Format one test case as the text of a Rust tuple literal.

    x[0] is a list of code points and is written as one string literal.
    Each further element of `x` is a list of code point lists and is
    written as a `&[...]` slice of string literals. Code points are
    written as \\u{...} escapes with lowercase hexadecimal digits.
    """
    def escaped(codepoints):
        # one \u{...} escape per code point
        return ''.join(["\\u{%x}" % cp for cp in codepoints])

    slices = []
    for pieces in x[1:]:
        quoted = ['"' + escaped(piece) + '"' for piece in pieces]
        slices.append(','.join(quoted))

    return '("' + escaped(x[0]) + '",&[' + '],&['.join(slices) + '])'
|
||
def create_grapheme_data():
    """Write graph_tests.rs from the data in GraphemeBreakTest.txt.

    The test data is loaded with rules 9.1 and 9.2 treated as breaks.
    For every test case the clusters are then rebuilt with those two
    rules not breaking. Cases where both segmentations agree are emitted
    in the "test_same" table, all other cases in "test_diff".
    """
    # rules 9.1 and 9.2 are for extended graphemes only
    ext_only_rules = ['9.1','9.2']
    cases = load_test_data("auxiliary/GraphemeBreakTest.txt", ext_only_rules)

    same_cases = []
    diff_cases = []

    for (clusters, rules) in cases:
        flat = []
        for cluster in clusters:
            flat.extend(cluster)

        extended = []
        current = list(clusters[0])
        for n, rule in enumerate(rules):
            if rule not in ext_only_rules:
                # a real break: close the cluster built so far
                extended.append(current)
                current = []
            current.extend(clusters[n + 1])

        # these are the extended grapheme clusters
        extended.append(current)

        if extended != clusters:
            diff_cases.append((flat, extended, clusters))
        else:
            same_cases.append((flat, clusters))

    same_type = "&[(&str, &[&str])]"
    diff_type = "&[(&str, &[&str], &[&str])]"
    with open("graph_tests.rs", "w") as out:
        out.write(" // official Unicode test data\n")
        out.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
        unicode.emit_table(out, "test_same", same_cases, same_type, False, showfun, False)
        unicode.emit_table(out, "test_diff", diff_cases, diff_type, False, showfun, False)
|
||
def create_words_data():
    """Write word_tests.rs from the data in WordBreakTest.txt.

    Every test case is emitted in the "test_word" table as the flattened
    list of its code points together with the list of its segments.
    """
    cases = load_test_data("auxiliary/WordBreakTest.txt")

    table = []
    for (segments, _rules) in cases:
        flat = []
        for segment in segments:
            flat.extend(segment)
        table.append((flat, segments))

    word_type = "&[(&str, &[&str])]"
    with open("word_tests.rs", "w") as out:
        out.write(" // official Unicode test data\n")
        out.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
        unicode.emit_table(out, "test_word", table, word_type, False, showfun, False)
|
||
# Script entry: regenerate both test data files (graph_tests.rs and
# word_tests.rs); they are written to the current working directory.
create_grapheme_data()
create_words_data()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.