Skip to content

Commit

Permalink
add implementation of UAX#29 word bounds algorithm
Browse files Browse the repository at this point in the history
This patch does the following:

1. Adds three new structs in libunicode/str.rs:

   a. UnicodeWords: a filter on the UWordBounds iterator that yields only
      the "words" of a string as defined in Section 4 of Unicode Standard
      Annex #29 (UAX#29), http://unicode.org/reports/tr29/#Word_Boundaries

   b. UWordBounds: an iterator that segments a string on its word
      boundaries as defined in UAX#29. Note that this *only* segments
      the string, and does *not* drop whitespace and other non-word
      pieces of the text (that's what UnicodeWords does).

      Note that UWordBounds has both a forward and backward iterator
      that have total running time (that is, to segment the entire
      string) linear in the size of the string. It should be noted that
      with pathological inputs the reverse iterator could be about 2x less
      efficient than the forward iterator, but on reasonable inputs
      their costs are similar.

   c. UWordBoundIndices: the above iterator, but returning tuples of
      (offset, &str).

2. Adds three new functions in the `UnicodeStr` trait:

   a. words_unicode(): returns a UnicodeWords iterator.

   b. split_words_uax29(): returns a UWordBounds iterator.

   c. split_words_uax29_indices(): returns a UWordBoundIndices iterator.

3. Updates the `src/etc/unicode.py` script to generate tables necessary
   for running the UWordBounds iterators.

4. Adds a new script, `src/etc/unicode_gen_breaktests.py`,
   which processes the grapheme and word break tests published
   by the Unicode consortium into a format for inclusion in
   libcollectionstest.

5. Adds new impls in libcollections's `str` corresponding to the
   `UnicodeStr` functions of (2).

   Note that this new functionality is gated with `feature(unicode)`.

6. Adds tests in libcollectionstest to exercise this new functionality.

   In addition, updates the test data for the graphemes test to
   correspond to the output from the script of (4). (Note that at the
   moment this change is primarily cosmetic.)

This patch does not settle the question raised by @huonw in rust-lang#15628;
rather, it introduces a new function alongside `words()` that follows
UAX#29.

In addition, it does not address the concerns that @SimonSapin raises in
rust-lang/rfcs#1054 since it leaves `words()`
alone.
  • Loading branch information
kwantam committed Apr 12, 2015
1 parent 6790b0e commit d864250
Show file tree
Hide file tree
Showing 7 changed files with 2,596 additions and 374 deletions.
57 changes: 36 additions & 21 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# - DerivedNormalizationProps.txt
# - EastAsianWidth.txt
# - auxiliary/GraphemeBreakProperty.txt
# - auxiliary/WordBreakProperty.txt
# - PropList.txt
# - ReadMe.txt
# - Scripts.txt
Expand Down Expand Up @@ -290,11 +291,13 @@ def emit_bsearch_range_table(f):
""")

def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
pub_string = ""
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
pub_string = "const"
if not is_const:
pub_string = "let"
if is_pub:
pub_string = "pub "
f.write(" %sconst %s: %s = &[\n" % (pub_string, name, t_type))
pub_string = "pub " + pub_string
f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type))
data = ""
first = True
for dat in t_data:
Expand Down Expand Up @@ -375,21 +378,25 @@ def emit_conversions_module(f, lowerupper, upperlower):
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
f.write("}\n\n")

def emit_grapheme_module(f, grapheme_table, grapheme_cats):
f.write("""pub mod grapheme {
def emit_break_module(f, break_table, break_cats, name):
Name = name.capitalize()
f.write("""pub mod %s {
use core::slice::SliceExt;
pub use self::GraphemeCat::*;
pub use self::%sCat::*;
use core::result::Result::{Ok, Err};
#[allow(non_camel_case_types)]
#[derive(Clone, Copy)]
pub enum GraphemeCat {
""")
for cat in grapheme_cats + ["Any"]:
f.write(" GC_" + cat + ",\n")
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum %sCat {
""" % (name, Name, Name))

break_cats.append("Any")
break_cats.sort()
for cat in break_cats:
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
Expand All @@ -400,19 +407,19 @@ def emit_grapheme_module(f, grapheme_table, grapheme_cats):
let (_, _, cat) = r[idx];
cat
}
Err(_) => GC_Any
Err(_) => %sC_Any
}
}
pub fn grapheme_category(c: char) -> GraphemeCat {
bsearch_range_value_table(c, grapheme_cat_table)
pub fn %s_category(c: char) -> %sCat {
bsearch_range_value_table(c, %s_cat_table)
}
""")
""" % (Name, Name, Name[0], name, Name, name))

emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
is_pub=False)
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
is_pub=False, is_const=True)
f.write("}\n")

def emit_charwidth_module(f, width_table):
Expand Down Expand Up @@ -690,4 +697,12 @@ def optimize_width_table(wtable):
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
grapheme_table.sort(key=lambda w: w[0])
emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys())
emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
word_table = []
for cat in word_cats:
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, word_cats.keys(), "word")
196 changes: 196 additions & 0 deletions src/etc/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
#!/usr/bin/env python
# -*- coding: utf-8
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - GraphemeBreakTest.txt
# - WordBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.

import unicode, re, os, fileinput

def load_test_data(f, optsplit=[]):
    """Load break-test data from the Unicode test file `f`.

    Each returned entry is a tuple (chars, info): `chars` is a list of
    segments (each a list of code point values) and `info` is the list
    of rule numbers governing each break between adjacent segments.
    Tests containing surrogate code points are skipped.  Rule numbers
    listed in `optsplit` are treated as breaks even when the test file
    marks them as non-breaks.
    """
    outls = []

    # matches a test line:
    #   ÷ <codepoints with ÷/× marks> ÷ # ÷ [0.2] <rule annotations> ÷ [0.3]
    testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)
    for line in fileinput.input(os.path.basename(f)):
        # lines that include a test start with the ÷ character
        # (startswith is width-agnostic: correct for both Python 2 byte
        # strings, where '÷' is two bytes, and Python 3 text strings)
        if not line.startswith('÷'):
            continue

        m = testRe1.match(line)
        if not m:
            # call form so this also works under Python 3
            print("error: no match on line where test was expected: %s" % line)
            continue

        # process the characters in this test case
        chars = process_split_string(m.group(1))
        # skip test case if it contains invalid characters (viz., surrogates)
        if not chars:
            continue

        # now process test cases
        (chars, info) = process_split_info(m.group(2), chars, optsplit)

        # make sure that we have break info for each break!
        assert len(chars) - 1 == len(info)

        outls.append((chars, info))

    return outls

def process_split_info(s, c, o):
    """Parse the break-rule annotations `s` for the positions in `c`.

    `s` looks like "× [3.0] ÷ [999.0] ...": each '÷' marks a break and
    each '×' a non-break, followed by the governing rule number in
    brackets.  `c` is a list of character lists, one per position; it is
    consumed destructively via pop().  Rule numbers listed in `o` are
    treated as breaks even when marked '×'.

    Returns (segments, rules): the merged character lists and the rule
    number recorded at each break.
    """
    outcs = []
    outis = []
    workcs = c.pop(0)

    # are we on a × or a ÷?
    # (use startswith rather than a fixed-width slice compare: '×' is two
    # bytes in a Python 2 byte string but one char in Python 3, so slicing
    # s[0:2] is only correct under Python 2)
    isX = s.startswith('×')

    # find each instance of '(÷|×) [x.y] '
    while s:
        # find the currently considered rule number
        sInd = s.index('[') + 1
        eInd = s.index(']')

        # if it's '× [a.b]' where 'a.b' is in o, then
        # we consider it a split even though it's not
        # marked as one
        # if it's ÷ then it's always a split
        if not isX or s[sInd:eInd] in o:
            outis.append(s[sInd:eInd])
            outcs.append(workcs)
            workcs = c.pop(0)
        else:
            workcs.extend(c.pop(0))

        # advance to the next break/non-break marker, if any;
        # when none remains, idx reaches len(s) and the loop terminates
        idx = 1
        while idx < len(s):
            if s.startswith('×', idx):
                isX = True
                break
            if s.startswith('÷', idx):
                isX = False
                break
            idx += 1
        s = s[idx:]

    outcs.append(workcs)
    return (outcs, outis)

def process_split_string(s):
    """Parse a whitespace-separated test string such as "÷ 0061 × 0062 ÷".

    Hex code points are grouped into segments, with the '÷' and '×'
    markers acting as segment separators.  Returns the list of segments
    (each a list of code point values), or an empty list if the test
    contains a surrogate code point and should be skipped entirely.
    """
    segments = []
    current = []

    for token in s.split():
        # a break/non-break marker closes out the current segment
        if token in ('÷', '×'):
            segments.append(current)
            current = []
        else:
            value = int(token, 16)

            # surrogates are not valid chars; the caller skips this test
            if unicode.is_surrogate(value):
                return []

            current.append(value)

    if current:
        segments.append(current)

    return segments

def showfun(x):
    """Render one test tuple as a Rust tuple literal.

    `x` is (all_chars, seglist1[, seglist2 ...]): the first element is a
    flat list of code points, emitted as a single escaped Rust string;
    each following element is a list of segments, emitted as a &[&str]
    of escaped strings.
    """
    pieces = ['("']
    for cp in x[0]:
        pieces.append("\\u{%x}" % cp)
    pieces.append('",&[')

    first_list = True
    for seglist in x[1:]:
        # subsequent segment lists open a new &[...] in the tuple
        if not first_list:
            pieces.append('],&[')
        first_list = False

        first_seg = True
        for seg in seglist:
            if not first_seg:
                pieces.append(',')
            first_seg = False

            pieces.append('"')
            for cp in seg:
                pieces.append("\\u{%x}" % cp)
            pieces.append('"')

    pieces.append('])')
    return ''.join(pieces)

def create_grapheme_data():
    """Generate grapheme-boundary test tables from GraphemeBreakTest.txt.

    Writes `graph_tests.rs` with two tables:
      - test_same: tests where legacy and extended grapheme clusters agree;
      - test_diff: tests where they differ, storing both segmentations.
    """
    # rules 9.1 and 9.2 are for extended graphemes only
    optsplits = ['9.1','9.2']
    d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

    test_same = []
    test_diff = []

    for (c, i) in d:
        # flatten all segments into the full test string's code points
        allchars = [cn for s in c for cn in s]
        extgraphs = []
        extwork = []

        # re-merge segments whose break rule applies only to extended
        # graphemes (those in optsplits), producing the extended clusters
        extwork.extend(c[0])
        for n in range(0,len(i)):
            if i[n] in optsplits:
                extwork.extend(c[n+1])
            else:
                extgraphs.append(extwork)
                extwork = []
                extwork.extend(c[n+1])

        # these are the extended grapheme clusters
        extgraphs.append(extwork)

        if extgraphs == c:
            test_same.append((allchars, c))
        else:
            test_diff.append((allchars, extgraphs, c))

    # Rust types of the emitted tables
    stype = "&[(&str, &[&str])]"
    dtype = "&[(&str, &[&str], &[&str])]"
    with open("graph_tests.rs", "w") as rf:
        rf.write(" // official Unicode test data\n")
        rf.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
        unicode.emit_table(rf, "test_same", test_same, stype, False, showfun, False)
        unicode.emit_table(rf, "test_diff", test_diff, dtype, False, showfun, False)

def create_words_data():
    """Generate word-boundary test tables from WordBreakTest.txt.

    Writes `word_tests.rs` containing a single table mapping each test
    string to its expected UAX#29 word segmentation.
    """
    data = load_test_data("auxiliary/WordBreakTest.txt")

    # pair each test's flattened code point list with its segmentation
    test = [([cp for seg in segs for cp in seg], segs) for (segs, _) in data]

    wtype = "&[(&str, &[&str])]"
    with open("word_tests.rs", "w") as rf:
        rf.write(" // official Unicode test data\n")
        rf.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
        unicode.emit_table(rf, "test_word", test, wtype, False, showfun, False)

# Generate both data sets when this script is run.
create_grapheme_data()
create_words_data()
64 changes: 64 additions & 0 deletions src/libcollections/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ pub use core::str::{MatchIndices, RMatchIndices};
pub use core::str::{from_utf8, Chars, CharIndices, Bytes};
pub use core::str::{from_utf8_unchecked, ParseBoolError};
pub use unicode::str::{Words, Graphemes, GraphemeIndices};
pub use unicode::str::{UnicodeWords, UWordBounds, UWordBoundIndices};
pub use core::str::pattern;

/*
Expand Down Expand Up @@ -1736,6 +1737,30 @@ impl str {
UnicodeStr::words(&self[..])
}

/// An iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// In this function, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # #![feature(unicode, core)]
/// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uw1 = uws.words_unicode().collect::<Vec<&str>>();
/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
///
/// assert_eq!(&uw1[..], b);
/// ```
#[unstable(feature = "unicode",
reason = "questions remain regarding the naming of words() and words_unicode()")]
pub fn words_unicode(&self) -> UnicodeWords {
UnicodeStr::words_unicode(&self[..])
}

/// Returns a string's displayed width in columns.
///
/// Control characters have zero width.
Expand Down Expand Up @@ -1819,4 +1844,43 @@ impl str {
s.extend(self[..].chars().flat_map(|c| c.to_uppercase()));
return s;
}

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # #![feature(unicode, core)]
/// let swu1 = "The quick (\"brown\") fox".split_words_uax29().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
#[unstable(feature = "unicode",
reason = "this functionality may only be provided by libunicode")]
pub fn split_words_uax29(&self) -> UWordBounds {
UnicodeStr::split_words_uax29(&self[..])
}

/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
/// and their offsets. See `split_words_uax29()` for more information.
///
/// # Example
///
/// ```
/// # #![feature(unicode, core)]
/// let swi1 = "Brr, it's 29.3°F!".split_words_uax29_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
/// (14, "°"), (16, "F"), (17, "!")];
///
/// assert_eq!(&swi1[..], b);
/// ```
#[unstable(feature = "unicode",
reason = "this functionality may only be provided by libunicode")]
pub fn split_words_uax29_indices(&self) -> UWordBoundIndices {
UnicodeStr::split_words_uax29_indices(&self[..])
}
}
Loading

0 comments on commit d864250

Please sign in to comment.