Add an add_prefix_space Arg in BytePairTokenizer #715

Merged · 6 commits · Feb 3, 2023
10 changes: 10 additions & 0 deletions keras_nlp/tokenizers/byte_pair_tokenizer.py
@@ -196,6 +196,11 @@ class BytePairTokenizer(tokenizer.Tokenizer):
            should have one merge rule per line.
        sequence_length: int, defaults to None. If set, the output will be
            padded or truncated to the `sequence_length`.
        add_prefix_space: bool, defaults to False. Whether or not to add an
            initial space to the input. This tokenizer is whitespace aware,
            and will tokenize a word with a leading space differently. Adding
            a prefix space to the first word will cause it to be tokenized
            equivalently to all subsequent words in the sequence.

    Examples:

@@ -230,6 +235,7 @@ def __init__(
        vocabulary,
        merges,
        sequence_length=None,
        add_prefix_space=False,
        **kwargs,
    ) -> None:
        assert_tf_text_installed(self.__class__.__name__)
@@ -268,6 +274,7 @@ def __init__(
f"Received: `type(merges)={type(merges)}`"
)
self.sequence_length = sequence_length
self.add_prefix_space = add_prefix_space

# Create byte <=> unicode mapping. This is useful for handling
# whitespace tokens.
@@ -451,6 +458,9 @@ def tokenize(self, inputs):
        if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
            inputs = tf.convert_to_tensor(inputs)

        if self.add_prefix_space:
            inputs = tf.strings.join([" ", inputs])

        scalar_input = inputs.shape.rank == 0
        if scalar_input:
            inputs = tf.expand_dims(inputs, 0)
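The prefix space is prepended with tf.strings.join, which broadcasts the scalar " " across every element of the input. A minimal standalone sketch of that behavior (plain TensorFlow, not part of this PR):

import tensorflow as tf

# A scalar " " broadcasts against a batch of strings, so each element
# gains exactly one leading space before the BPE splitting runs.
inputs = tf.constant(["brown fox", "black cat"])
print(tf.strings.join([" ", inputs]))
# tf.Tensor([b' brown fox' b' black cat'], shape=(2,), dtype=string)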
18 changes: 18 additions & 0 deletions keras_nlp/tokenizers/byte_pair_tokenizer_test.py
@@ -66,6 +66,24 @@ def test_tokenize_string_output(self):
        )
        self.assertAllEqual(call_output, expected)

    def test_tokenize_prefix_space(self):
        input_data = ["brown.", "black."]
        tokenizer = BytePairTokenizer(
            vocabulary=VOCAB_PATH,
            merges=MERGE_PATH,
            dtype=tf.string,
            add_prefix_space=True,
        )
        call_output = tokenizer(input_data)

        expected = tf.ragged.constant(
            [
                ["Ġbrown", "."],
                ["Ġblack", "."],
            ]
        )
        self.assertAllEqual(call_output, expected)

    def test_tokenize_scalar_input(self):
        input_data = "brown."
        encoded = self.tokenizer.tokenize(input_data)
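For context, a rough end-to-end usage sketch of the new argument outside the test suite. The "vocab.json" and "merges.txt" paths are hypothetical placeholders, and the printed tokens assume GPT-2-style BPE assets like those used in the test above:

import tensorflow as tf
import keras_nlp

# Hypothetical paths to GPT-2-style vocabulary and merge-rule files.
tokenizer = keras_nlp.tokenizers.BytePairTokenizer(
    vocabulary="vocab.json",
    merges="merges.txt",
    dtype=tf.string,
)
prefixed = keras_nlp.tokenizers.BytePairTokenizer(
    vocabulary="vocab.json",
    merges="merges.txt",
    dtype=tf.string,
    add_prefix_space=True,
)

# Without a prefix space, only words after the first carry the "Ġ" marker.
print(tokenizer.tokenize("brown fox"))   # e.g. ["brown", "Ġfox"]
# With add_prefix_space=True, the first word tokenizes like the rest.
print(prefixed.tokenize("brown fox"))    # e.g. ["Ġbrown", "Ġfox"]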