
Adding a UnicodeCharacterTokenizer #100

Merged · 42 commits · Apr 16, 2022

Commits
f04325c
Debugging
aflah02 Apr 8, 2022
20db313
Debugging
aflah02 Apr 8, 2022
e0b6c44
Fixed Sequence Length Issue
aflah02 Apr 8, 2022
f949f5a
Sequence Length Changes
aflah02 Apr 8, 2022
ac2bb89
Removed _ From Class Attributes
aflah02 Apr 8, 2022
1d1a1a2
Fixed Null Bytes in Detokenization
aflah02 Apr 8, 2022
ef1b5b6
Testing regex_replace
aflah02 Apr 8, 2022
0de3153
Testing
aflah02 Apr 8, 2022
161e316
Helper Function and Debug Statements
aflah02 Apr 8, 2022
8054855
Testing Regex Replace New Ordering
aflah02 Apr 8, 2022
d638260
Added Checks for Errors and Normalization Form
aflah02 Apr 8, 2022
5fad8ad
Doc String Completed
aflah02 Apr 8, 2022
a6b095f
Ran lint/format
aflah02 Apr 8, 2022
c45de16
New Tests and Decoding Changes
aflah02 Apr 8, 2022
78f4da7
Changes
aflah02 Apr 8, 2022
927fdc6
Minor Tweak
aflah02 Apr 8, 2022
68830d4
Tweaking Detokenizer
aflah02 Apr 8, 2022
7137c39
Added Tests and Updated Docstrings
aflah02 Apr 8, 2022
8cd02d2
Ran format.sh and lint.sh
aflah02 Apr 8, 2022
11e5eed
Refactoring and Removing Unused Lines
aflah02 Apr 12, 2022
09f5f30
Fixed Some Broken Tests
aflah02 Apr 12, 2022
91c06af
Fixed All Tests
aflah02 Apr 12, 2022
24fb3ac
Testing Decode
aflah02 Apr 12, 2022
2ded9a7
Testing
aflah02 Apr 12, 2022
82ee48c
Debug
aflah02 Apr 12, 2022
43c33c8
Fixes + Replaced Regex with BooleanMask
aflah02 Apr 13, 2022
0731294
Added Debug Lines
aflah02 Apr 13, 2022
4da8739
Added Debug Line for .numpy()
aflah02 Apr 13, 2022
996fd25
Testing Byte Tokenizer Approach
aflah02 Apr 13, 2022
44b01f7
Testing With Unicode_transcode
aflah02 Apr 13, 2022
d3fe320
Listing Methods of Object
aflah02 Apr 13, 2022
aaf9454
Testing _numpy
aflah02 Apr 13, 2022
1046798
Added Decode Call
aflah02 Apr 13, 2022
fa8eeea
Checking Methods post _numpy()
aflah02 Apr 13, 2022
b47806a
Removed Debug Statements and Improved Docstring
aflah02 Apr 13, 2022
9d1514f
Fixed Failing Test
aflah02 Apr 13, 2022
1ec59df
Ran format/lint
aflah02 Apr 13, 2022
ba76dcc
Fixed Docstring and Improved Examples
aflah02 Apr 15, 2022
ac55c10
Merge branch 'keras-team:master' into master
aflah02 Apr 15, 2022
96cf050
Ran format and lint
aflah02 Apr 15, 2022
a915c3d
Copy edits
mattdangerw Apr 16, 2022
053375d
Copy edits
mattdangerw Apr 16, 2022
3 changes: 3 additions & 0 deletions keras_nlp/tokenizers/__init__.py
@@ -14,4 +14,7 @@

from keras_nlp.tokenizers.byte_tokenizer import ByteTokenizer
from keras_nlp.tokenizers.tokenizer import Tokenizer
from keras_nlp.tokenizers.unicode_character_tokenizer import (
UnicodeCharacterTokenizer,
)
from keras_nlp.tokenizers.word_piece_tokenizer import WordPieceTokenizer
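
For context, a minimal sketch of what this export change enables (assuming keras_nlp is installed from this branch): the new tokenizer becomes importable directly from keras_nlp.tokenizers, alongside ByteTokenizer and WordPieceTokenizer.

import keras_nlp

# The package-level export added above makes this import path available.
tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer()
print(tokenizer("hello"))  # int32 tensor of Unicode codepoints
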
4 changes: 2 additions & 2 deletions keras_nlp/tokenizers/byte_tokenizer.py
@@ -97,7 +97,7 @@ class ByteTokenizer(tokenizer.Tokenizer):
>>> ds.take(1).get_single_element()
<tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>

Batch up the inputs and then tokenize.
Batch the inputs and then tokenize.
>>> tokenizer = keras_nlp.tokenizers.ByteTokenizer()
>>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
>>> ds = ds.batch(2).map(tokenizer)
@@ -114,7 +114,7 @@ class ByteTokenizer(tokenizer.Tokenizer):
array([[104, 101, 108, 108, 111],
[102, 117, 110, 0, 0]])>

Batch up the inputs and then tokenize (`sequence_length` provided).
Batch the inputs and then tokenize (`sequence_length` provided).
>>> tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=5)
>>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
>>> ds = ds.batch(2).map(tokenizer)
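
As an aside on the docstring wording edited above, here is a standalone sketch of the batch-then-tokenize pattern it describes, assuming TensorFlow 2.x and keras_nlp are installed; with sequence_length set, the mapped output is a dense tensor padded with zeros.

import tensorflow as tf
import keras_nlp

# Batch the raw strings first, then map the tokenizer over each batch.
tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=5)
ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
ds = ds.batch(2).map(tokenizer)

# Each element is a dense (2, 5) int tensor; "fun" is right-padded with 0.
print(ds.take(1).get_single_element())
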
272 changes: 272 additions & 0 deletions keras_nlp/tokenizers/unicode_character_tokenizer.py
@@ -0,0 +1,272 @@
# Copyright 2022 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any
from typing import Dict

import tensorflow as tf
import tensorflow_text as tf_text

from keras_nlp.tokenizers import tokenizer


class UnicodeCharacterTokenizer(tokenizer.Tokenizer):
"""A unicode character tokenizer layer.

This tokenizer is a vocabulary-free tokenizer which tokenizes text as
unicode character codepoints.

Args:
lowercase: If true, the input text will be lowercased before
tokenization.
sequence_length: If set, the output will be converted to a dense
tensor and padded/trimmed so all outputs are of sequence_length.
normalization_form: One of the following string values (None, 'NFC',
'NFKC', 'NFD', 'NFKD'). If set, the input text will be normalized to
the given form before tokenizing.
errors: One of ('replace', 'ignore', 'strict'). Specifies the
`detokenize()` behavior when an invalid codepoint is encountered.
(same behavior as
https://www.tensorflow.org/api_docs/python/tf/strings/unicode_transcode)
replacement_char: The unicode codepoint to use in place of invalid
codepoints. Defaults to 65533 (U+FFFD).
input_encoding: One of ("UTF-8", "UTF-16-BE", or "UTF-32-BE").
The encoding of the input text. Defaults to "UTF-8".
output_encoding: One of ("UTF-8", "UTF-16-BE", or "UTF-32-BE").
The encoding of the output text. Defaults to "UTF-8".

Examples:

Basic Usage.
>>> inputs = "Unicode Tokenizer"
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer()
>>> tokenizer(inputs)
<tf.Tensor: shape=(17,), dtype=int32, numpy=
array([117, 110, 105, 99, 111, 100, 101, 32, 116, 111, 107, 101, 110,
105, 122, 101, 114], dtype=int32)>

Ragged outputs.
>>> inputs = ["Book", "पुस्तक", "کتاب"]
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer()
>>> tokenizer(inputs)
<tf.RaggedTensor [[98, 111, 111, 107],
[2346, 2369, 2360, 2381, 2340, 2325],
[1705, 1578, 1575, 1576]]>

Dense outputs.
>>> inputs = ["Book", "पुस्तक", "کتاب"]
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
... sequence_length=8)
>>> tokenizer(inputs)
<tf.Tensor: shape=(3, 8), dtype=int32, numpy=
array([[ 98, 111, 111, 107, 0, 0, 0, 0],
[2346, 2369, 2360, 2381, 2340, 2325, 0, 0],
[1705, 1578, 1575, 1576, 0, 0, 0, 0]], dtype=int32)>

Tokenize first, then batch the dataset.
>>> inputs = ["Book", "पुस्तक", "کتاب"]
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer()
>>> ds = tf.data.Dataset.from_tensor_slices(inputs)
>>> ds = ds.map(tokenizer)
>>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(3))
>>> ds.take(1).get_single_element()
<tf.RaggedTensor [[98, 111, 111, 107],
[2346, 2369, 2360, 2381, 2340, 2325],
[1705, 1578, 1575, 1576]]>

Batch the inputs and then tokenize.
>>> inputs = ["Book", "पुस्तक", "کتاب"]
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer()
>>> ds = tf.data.Dataset.from_tensor_slices(inputs)
>>> ds = ds.batch(3).map(tokenizer)
>>> ds.take(1).get_single_element()
<tf.RaggedTensor [[98, 111, 111, 107],
[2346, 2369, 2360, 2381, 2340, 2325],
[1705, 1578, 1575, 1576]]>

Tokenize first, then batch for dense outputs (`sequence_length` provided).
>>> inputs = ["Book", "पुस्तक", "کتاب"]
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
... sequence_length=5)
>>> ds = tf.data.Dataset.from_tensor_slices(inputs)
>>> ds = ds.map(tokenizer)
>>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(3))
>>> ds.take(1).get_single_element()
<tf.Tensor: shape=(3, 5), dtype=int32, numpy=
array([[ 98, 111, 111, 107, 0],
[2346, 2369, 2360, 2381, 2340],
[1705, 1578, 1575, 1576, 0]], dtype=int32)>

Batch first, then tokenize for dense outputs (`sequence_length` provided).
>>> inputs = ["Book", "पुस्तक", "کتاب"]
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
... sequence_length=5)
>>> ds = tf.data.Dataset.from_tensor_slices(inputs)
>>> ds = ds.batch(3).map(tokenizer)
>>> ds.take(1).get_single_element()
<tf.Tensor: shape=(3, 5), dtype=int32, numpy=
array([[ 98, 111, 111, 107, 0],
[2346, 2369, 2360, 2381, 2340],
[1705, 1578, 1575, 1576, 0]], dtype=int32)>

Tokenization showcasing truncation of long sequences.
>>> inputs = ["I Like to Travel a Lot", "मैं किताबें पढ़ना पसंद करता हूं"]
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
... sequence_length=5)
>>> tokenizer(inputs)
<tf.Tensor: shape=(2, 5), dtype=int32,
numpy=array([[ 105, 32, 108, 105, 107],
[2350, 2376, 2306, 32, 2325]], dtype=int32)>

Detokenization.
>>> inputs = tf.constant([110, 105, 110, 106, 97], dtype=tf.int32)
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer()
>>> tokenizer.detokenize(inputs)
<tf.Tensor: shape=(), dtype=string, numpy=b'ninja'>

Detokenization, showing padding characters being removed.
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
... sequence_length=7)
>>> dataset = tf.data.Dataset.from_tensor_slices(["a b c", "b c", "a"])
>>> dataset = dataset.map(tokenizer)
>>> dataset.take(1).get_single_element()
<tf.Tensor: shape=(7,), dtype=int32,
numpy=array([97, 32, 98, 32, 99, 0, 0], dtype=int32)>
>>> detokunbatched = dataset.map(tokenizer.detokenize)
>>> detokunbatched.take(1).get_single_element()
<tf.Tensor: shape=(), dtype=string, numpy=b'a b c'>

Detokenization with invalid codepoints.
>>> # The 10000000 in the inputs tensor below is an invalid codepoint.
>>> # It is therefore replaced with the replacement_char 75, i.e. 'K'.
>>> inputs = tf.constant([110, 105, 10000000, 110, 106, 97])
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
... errors="replace", replacement_char=75)
>>> tokenizer.detokenize(inputs).numpy().decode('utf-8')
'niKnja'
"""

def __init__(
self,
sequence_length: int = None,
lowercase: bool = True,
normalization_form: str = None,
errors: str = "replace",
replacement_char: int = 65533,
input_encoding: str = "UTF-8",
output_encoding: str = "UTF-8",
**kwargs,
) -> None:
# Check dtype and provide a default.
if "dtype" not in kwargs or kwargs["dtype"] is None:
kwargs["dtype"] = tf.int32
else:
dtype = tf.dtypes.as_dtype(kwargs["dtype"])
if not dtype.is_integer and dtype != tf.string:
raise ValueError(
"Output dtype must be an integer type or a string. "
f"Received: dtype={dtype}"
)

# Check normalization_form.
if normalization_form not in [None, "NFC", "NFKC", "NFD", "NFKD"]:
raise ValueError(
'`normalization_form` must be one of None, "NFC", "NFKC", '
'"NFD", "NFKD". Received: normalization_form='
f"{normalization_form}"
)

# Check errors.
if errors not in ["strict", "replace", "ignore"]:
raise ValueError(
'`errors` must be one of "strict", "replace", "ignore". '
f"Received: errors={errors}"
)

# Check normalization_form matches input_encoding.
if normalization_form:
if input_encoding != "UTF-8":
raise ValueError(
"Normalization forms are only supported for input "
"encoding 'UTF-8'."
)

super().__init__(**kwargs)

self.sequence_length = sequence_length
self.lowercase = lowercase
self.normalization_form = normalization_form
self.errors = errors
self.replacement_char = replacement_char
self.input_encoding = input_encoding
self.output_encoding = output_encoding

def get_config(self) -> Dict[str, Any]:
config = super().get_config()
config.update(
{
"sequence_length": self.sequence_length,
"lowercase": self.lowercase,
"normalization_form": self.normalization_form,
"errors": self.errors,
"replacement_char": self.replacement_char,
"input_encoding": self.input_encoding,
"output_encoding": self.output_encoding,
}
)
return config

def tokenize(self, inputs):
if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
inputs = tf.convert_to_tensor(inputs)

scalar_input = inputs.shape.rank == 0
if scalar_input:
inputs = tf.expand_dims(inputs, 0)

# Optionally lowercase the text
if self.lowercase:
inputs = tf_text.case_fold_utf8(inputs)

# Optionally normalize the text to a given form
if self.normalization_form:
inputs = tf_text.normalize_utf8(inputs, self.normalization_form)

tokens = tf.strings.unicode_decode(
inputs,
errors=self.errors,
replacement_char=self.replacement_char,
input_encoding=self.input_encoding,
)

if self.sequence_length:
output_shape = tokens.shape.as_list()
output_shape[-1] = self.sequence_length
tokens = tokens.to_tensor(shape=output_shape)

if scalar_input:
tokens = tf.squeeze(tokens, 0)
return tokens

def detokenize(self, inputs):
inputs = tf.ragged.boolean_mask(inputs, tf.not_equal(inputs, 0))
encoded_string = tf.strings.unicode_encode(
inputs,
errors=self.errors,
replacement_char=self.replacement_char,
output_encoding=self.output_encoding,
)
return encoded_string
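
For reference, a minimal round-trip sketch of the tokenize/detokenize pair defined above (assuming TensorFlow 2.x and this branch of keras_nlp). The key detail is that detokenize masks out codepoint 0 with tf.ragged.boolean_mask before re-encoding, which is how padding introduced by sequence_length is stripped.

import keras_nlp

tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(sequence_length=8)

# tokenize: unicode_decode yields ragged codepoints, then to_tensor pads
# each row with 0 up to sequence_length, giving a dense (2, 8) tensor.
tokens = tokenizer(["Book", "पुस्तक"])

# detokenize: zeros are masked out and the remaining codepoints are
# re-encoded, so the (lowercased) inputs are recovered.
print(tokenizer.detokenize(tokens))  # [b'book', b'<UTF-8 bytes>']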
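
Similarly, a small sketch of the serialization path that get_config supports; this assumes the standard Keras Layer.from_config behavior, which rebuilds the layer from the returned dict.

import keras_nlp

original = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
    sequence_length=16, lowercase=False, normalization_form="NFC"
)

# get_config returns a plain dict of constructor arguments, so an
# equivalent tokenizer can be reconstructed from it.
config = original.get_config()
restored = keras_nlp.tokenizers.UnicodeCharacterTokenizer.from_config(config)
print(restored.sequence_length, restored.lowercase)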