Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Ranges #19

Merged
merged 1 commit into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions uax29/RangeTokenizer.Test.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
namespace Tests;

using UAX29;
using System.Linq;
using System.Text;

/// <summary>
/// Tests for the range enumerator obtained from a tokenizer's <c>Ranges</c> property.
/// </summary>
[TestFixture]
public class TestRangeTokenizer
{
    /// <summary>
    /// After the range tokenizer has been advanced to the end, Reset() must make it
    /// yield the same ranges again.
    /// </summary>
    [Test]
    public void Reset()
    {
        var example = "Hello, how are you?";

        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;

        // Advance the instance itself with MoveNext. A foreach would iterate a copy
        // returned by GetEnumerator(), leaving `ranges` untouched and Reset() untested.
        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }

        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing
        Assert.That(ranges.MoveNext(), Is.False, "Tokenizer should stay exhausted until Reset is called");

        ranges.Reset();

        var second = new List<Range>();
        while (ranges.MoveNext())
        {
            second.Add(ranges.Current);
        }

        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// SetText() on an already-advanced range tokenizer must restart iteration from the beginning.
    /// </summary>
    [Test]
    public void SetText()
    {
        var example = "Hello, how are you?";

        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;

        // Advance the instance itself (not a foreach copy) so that SetText has state to reset.
        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }

        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing

        ranges.SetText(example);

        var second = new List<Range>();
        while (ranges.MoveNext())
        {
            second.Add(ranges.Current);
        }

        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// Each range, applied to the original input, must equal the token produced by the
    /// parent tokenizer at the same position, and both must end at the same time.
    /// </summary>
    [Test]
    public void MatchesTokenizer()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var tokens = Tokenizer.GetWords(example);
        var ranges = tokens.Ranges;

        foreach (var range in ranges)
        {
            Assert.That(tokens.MoveNext(), Is.True, "Tokenizer ended before the ranges did");

            var ranged = example.AsSpan(range);
            var token = tokens.Current;
            Assert.That(token.SequenceEqual(ranged));
        }

        Assert.That(tokens.MoveNext(), Is.False, "Tokenizer produced more tokens than ranges");
    }

    /// <summary>
    /// Manual MoveNext/Current iteration and foreach iteration must produce the same ranges.
    /// </summary>
    [Test]
    public void Enumerator()
    {
        var input = "Hello, how are you?";
        var mem = input.AsMemory();
        Tokenizer.GetWords(mem); // result intentionally unused; only exercises the memory-based call

        var words = Tokenizer.GetWords(input);
        var ranges = words.Ranges;

        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }
        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing

        // Use a fresh tokenizer for the foreach pass, independent of the one above.
        var tokens2 = Tokenizer.GetWords(input);
        var ranges2 = tokens2.Ranges;

        var second = new List<Range>();
        foreach (var range in ranges2)
        {
            second.Add(range);
        }
        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// ToList() must match foreach iteration, leave the tokenizer at the beginning,
    /// and throw once iteration has begun.
    /// </summary>
    [Test]
    public void ToList()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;
        var list = ranges.ToList();

        var i = 0;
        foreach (var range in ranges)
        {
            Assert.That(range, Is.EqualTo(list[i]));
            i++;
        }

        Assert.That(list, Has.Count.EqualTo(i), "ToList should return the same number of tokens as iteration");

        // Tokenizer should reset back to the beginning
        Assert.That(ranges.start, Is.EqualTo(0));
        Assert.That(ranges.end, Is.EqualTo(0));

        // try/catch rather than Assert.Throws: `ranges` is a ref struct and
        // cannot be captured by a lambda.
        var threw = false;
        ranges.MoveNext();
        try
        {
            ranges.ToList();
        }
        catch (InvalidOperationException)
        {
            threw = true;
        }
        Assert.That(threw, Is.True, "Calling ToList after iteration has begun should throw");
    }

    /// <summary>
    /// ToArray() must match foreach iteration, leave the tokenizer at the beginning,
    /// and throw once iteration has begun.
    /// </summary>
    [Test]
    public void ToArray()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;
        var array = ranges.ToArray();

        var i = 0;
        foreach (var range in ranges)
        {
            Assert.That(range, Is.EqualTo(array[i]));
            i++;
        }

        Assert.That(array, Has.Length.EqualTo(i), "ToArray should return the same number of tokens as iteration");

        // Tokenizer should reset back to the beginning
        Assert.That(ranges.start, Is.EqualTo(0));
        Assert.That(ranges.end, Is.EqualTo(0));

        // try/catch rather than Assert.Throws: `ranges` is a ref struct and
        // cannot be captured by a lambda.
        var threw = false;
        ranges.MoveNext();
        try
        {
            ranges.ToArray();
        }
        catch (InvalidOperationException)
        {
            threw = true;
        }
        Assert.That(threw, Is.True, "Calling ToArray after iteration has begun should throw");
    }
}
125 changes: 125 additions & 0 deletions uax29/RangeTokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
namespace UAX29;

/// <summary>
/// Enumerates the boundaries of tokens in the input as <see cref="Range"/> values,
/// instead of returning the tokens themselves. Token lengths are determined by the
/// split function supplied at construction.
/// </summary>
/// <typeparam name="T">The element type of the input span (byte or char in this library).</typeparam>
public ref struct RangeTokenizer<T> where T : struct
{
    ReadOnlySpan<T> input;

    readonly Split<T> split;

    // Boundaries of the current token within input; both are 0 before the first MoveNext().
    internal int start = 0;
    internal int end = 0;

    // Set on the first MoveNext() call; cleared by Reset().
    bool begun = false;

    /// <summary>
    /// Creates a range tokenizer over <paramref name="input"/>.
    /// </summary>
    /// <param name="input">The data whose token boundaries will be enumerated.</param>
    /// <param name="split">Called with the not-yet-consumed remainder of the input; its return value is taken as the length of the next token.</param>
    internal RangeTokenizer(ReadOnlySpan<T> input, Split<T> split)
    {
        this.input = input;
        this.split = split;
    }

    /// <summary>
    /// Advances to the next token. Use <see cref="Current"/> to retrieve its range.
    /// </summary>
    /// <returns>True if a token was found; false when the input is exhausted or the split function reports a zero-length advance.</returns>
    public bool MoveNext()
    {
        begun = true;

        if (end >= input.Length)
        {
            return false;
        }

        // NOTE(review): the second argument is always true here; presumably an
        // end-of-input flag. Confirm against the Split<T> delegate definition.
        var length = split(input[end..], true);
        if (length == 0)
        {
            // Interpret as EOF
            return false;
        }

        // The new token begins where the previous one ended.
        start = end;
        end += length;
        return true;
    }

    /// <summary>
    /// The boundaries of the current token, as a <see cref="Range"/> from the token's
    /// start offset to its end offset within the input.
    /// </summary>
    public readonly Range Current => new Range(start, end);

    /// <summary>
    /// Returns a copy of this tokenizer to serve as the enumerator, which enables foreach.
    /// </summary>
    public readonly RangeTokenizer<T> GetEnumerator() => this;

    /// <summary>
    /// Resets the tokenizer back to the first token.
    /// </summary>
    public void Reset()
    {
        start = 0;
        end = 0;
        begun = false;
    }

    /// <summary>
    /// (Re)sets the text to be tokenized, and resets the iterator back to the start.
    /// </summary>
    /// <param name="input">The new data to tokenize.</param>
    public void SetText(ReadOnlySpan<T> input)
    {
        Reset();
        this.input = input;
    }

    /// <summary>
    /// Iterates over all tokens and collects their ranges into a list.
    /// </summary>
    /// <returns>A <see cref="List{T}"/> of <see cref="Range"/>, one entry per token.</returns>
    /// <exception cref="InvalidOperationException">Thrown if iteration has already begun on this tokenizer.</exception>
    public List<Range> ToList()
    {
        if (begun)
        {
            throw new InvalidOperationException("ToList must not be called after iteration has begun. You may wish to call Reset() on the tokenizer.");
        }

        var collected = new List<Range>();

        // Walk a copy, exactly as a foreach over this tokenizer would.
        var cursor = GetEnumerator();
        while (cursor.MoveNext())
        {
            collected.Add(cursor.Current);
        }

        Reset();
        return collected;
    }

    /// <summary>
    /// Iterates over all tokens and collects their ranges into an array.
    /// </summary>
    /// <returns>An array of <see cref="Range"/>, one entry per token.</returns>
    /// <exception cref="InvalidOperationException">Thrown if iteration has already begun on this tokenizer.</exception>
    public Range[] ToArray()
    {
        if (begun)
        {
            throw new InvalidOperationException("ToArray must not be called after iteration has begun. You may wish to call Reset() on the tokenizer.");
        }

        var collected = ToList();
        return collected.ToArray();
    }
}
17 changes: 17 additions & 0 deletions uax29/Tokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System.Collections.Immutable;

namespace UAX29;

/// <summary>
Expand Down Expand Up @@ -131,4 +133,19 @@ public T[][] ToArray()

return this.ToList().ToArray();
}

/// <summary>
/// The ranges (boundaries) of the tokens, rather than the tokens themselves.
/// </summary>
/// <returns>
/// A new <see cref="RangeTokenizer{T}"/> built from this tokenizer's input and split function.
/// Iterate it with foreach, then apply each range to your original input
/// using [range] or .AsSpan(range) to get the tokens.
/// </returns>
public RangeTokenizer<T> Ranges => new(input, split);
}