
Commit

Update default to strip_accents=False (#289)
* update default to

* style fixes

* updated lowercase default

* lowercase tokenizer trainer default

* style fixes

* updated default

* Fixes for WordPieceTrainer (#293)

* Changed to temp_dir

* Addressed Review Comments

* Addressed Reviews

* Edits

Co-authored-by: Matt Watson <[email protected]>

* fixed rebase

* style fixes

Co-authored-by: Aflah <[email protected]>
Co-authored-by: Matt Watson <[email protected]>
3 people authored Aug 11, 2022
1 parent 81878ca commit fc01b18
Showing 4 changed files with 36 additions and 17 deletions.
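
Before the per-file diffs, a minimal sketch of what the new defaults mean for callers, based on the docstring example changed below. The vocabulary and expected output are taken from that docstring; anything beyond it (such as exact behavior for non-lowercased input under the new defaults) should be treated as illustrative.

import keras_nlp

vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]

# With the new defaults (lowercase=False, strip_accents=False), input text is
# matched against the vocabulary as-is. To keep the old behavior, opt in:
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,      # previously the default
    strip_accents=True,  # previously the default
)
print(tokenizer(["The quick brown fox."]))  # [[1, 2, 3, 4, 5, 6, 7]] per the docstring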
36 changes: 26 additions & 10 deletions keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -148,9 +148,9 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
plain text file containing a single WordPiece token per line.
sequence_length: int. If set, the output will be converted to a dense
tensor and padded/trimmed so all outputs are of sequence_length.
- lowercase: bool, defaults to `True`. If true, the input text will be
+ lowercase: bool, defaults to `False`. If true, the input text will be
lowercased before tokenization.
- strip_accents: bool, defaults to `True`. If true, all accent marks will
+ strip_accents: bool, defaults to `False`. If true, all accent marks will
be removed from text before tokenization.
split: bool, defaults to `True`. If true, input will be split on
whitespace and punctuation marks, and all punctuation marks will be
@@ -172,15 +172,21 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
Ragged outputs.
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = ["The quick brown fox."]
- >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)
+ >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+ ... vocabulary=vocab,
+ ... lowercase=True,
+ ... )
>>> tokenizer(inputs)
<tf.RaggedTensor [[1, 2, 3, 4, 5, 6, 7]]>
Dense outputs.
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = ["The quick brown fox."]
>>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
- ... vocabulary=vocab, sequence_length=10)
+ ... vocabulary=vocab,
+ ... sequence_length=10,
+ ... lowercase=True,
+ ... )
>>> tokenizer(inputs)
<tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[1, 2, 3, 4, 5, 6, 7, 0, 0, 0]], dtype=int32)>
@@ -189,22 +195,32 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = ["The quick brown fox."]
>>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
- ... vocabulary=vocab, dtype="string")
+ ... vocabulary=vocab,
+ ... lowercase=True,
+ ... dtype="string",
+ ... )
>>> tokenizer(inputs)
<tf.RaggedTensor [[b'the', b'qu', b'##ick', b'br', b'##own', b'fox', b'.']]>
Detokenization.
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = "The quick brown fox."
- >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)
+ >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+ ... vocabulary=vocab,
+ ... lowercase=True,
+ ... )
>>> tokenizer.detokenize(tokenizer.tokenize(inputs)).numpy().decode('utf-8')
'the quick brown fox .'
Custom splitting.
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = ["The$quick$brown$fox"]
- >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab,
- ... split=False, dtype='string')
+ >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+ ... vocabulary=vocab,
+ ... split=False,
+ ... lowercase=True,
+ ... dtype='string',
+ ... )
>>> split_inputs = tf.strings.split(inputs, sep="$")
>>> tokenizer(split_inputs)
<tf.RaggedTensor [[b'the', b'qu', b'##ick', b'br', b'##own', b'fox']]>
@@ -214,8 +230,8 @@ def __init__(
self,
vocabulary=None,
sequence_length: int = None,
- lowercase: bool = True,
- strip_accents: bool = True,
+ lowercase: bool = False,
+ strip_accents: bool = False,
split: bool = True,
suffix_indicator: str = "##",
oov_token: str = "[UNK]",
6 changes: 4 additions & 2 deletions keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -89,7 +89,7 @@ def test_special_tokens(self):
def test_lowercase(self):
input_data = ["the QUicK brOWN FOX"]
vocab_data = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"]
- tokenizer = WordPieceTokenizer(vocabulary=vocab_data)
+ tokenizer = WordPieceTokenizer(vocabulary=vocab_data, lowercase=True)
call_output = tokenizer(input_data)
self.assertAllEqual(call_output, [[1, 2, 3, 4, 5, 6]])

@@ -103,7 +103,9 @@ def test_skip_lowercase(self):
def test_strip_accents(self):
input_data = ["á é í ó ú"]
vocab_data = ["[UNK]", "a", "e", "i", "o", "u"]
- tokenizer = WordPieceTokenizer(vocabulary=vocab_data)
+ tokenizer = WordPieceTokenizer(
+ vocabulary=vocab_data, strip_accents=True
+ )
call_output = tokenizer(input_data)
self.assertAllEqual(call_output, [[1, 2, 3, 4, 5]])

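
The accent-stripping test above follows the same opt-in pattern. As a rough sketch of the user-facing call, reusing the vocabulary and input from that test; without strip_accents=True, the accented characters are looked up as-is and would not match this all-ASCII vocabulary.

import keras_nlp

vocab = ["[UNK]", "a", "e", "i", "o", "u"]
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    strip_accents=True,  # accent stripping is now opt-in, not the default
)
print(tokenizer(["á é í ó ú"]))  # [[1, 2, 3, 4, 5]] per the updated test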
10 changes: 5 additions & 5 deletions keras_nlp/tokenizers/word_piece_tokenizer_trainer.py
@@ -25,8 +25,8 @@ def compute_word_piece_vocabulary(
data,
vocabulary_size,
vocabulary_output_file=None,
- lowercase=True,
- strip_accents=True,
+ lowercase=False,
+ strip_accents=False,
split=True,
suffix_indicator="##",
reserved_tokens=["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]"],
@@ -45,9 +45,9 @@ def compute_word_piece_vocabulary(
vocabulary_size: int. The maximum size of a vocabulary to be trained.
vocabulary_output_file: str, defaults to `None`. The location to write a
vocabulary file.
- lowercase: bool, defaults to `True`. If true, the input text will be
+ lowercase: bool, defaults to `False`. If true, the input text will be
lowercased before tokenization.
- strip_accents: bool, defaults to `True`. If true, all accent marks will
+ strip_accents: bool, defaults to `False`. If true, all accent marks will
be removed from text before tokenization.
split: bool, defaults to `True`. If true, input will be split on
whitespace and punctuation marks, and all punctuation marks will be
@@ -92,7 +92,7 @@ def compute_word_piece_vocabulary(
>>> inputs = tf.data.Dataset.from_tensor_slices(["bat sat: pat mat rat.\n"])
>>> split_inputs = inputs.map(normalize_and_split)
>>> vocab = compute_word_piece_vocabulary(
- ... split_inputs, 13, split=False
+ ... split_inputs, 13, split=False,
... )
>>> vocab
['[PAD]', '[CLS]', '[SEP]', '[UNK]', '[MASK]', 'a', 'b', 'm', 'p', 'r', 's', 't', '##at']
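
Because the trainer defaults changed in lockstep with the tokenizer, a vocabulary should be trained and consumed with matching flags. A hypothetical end-to-end sketch, assuming the trainer is exported as keras_nlp.tokenizers.compute_word_piece_vocabulary; the input text and vocabulary size here are illustrative.

import tensorflow as tf
import keras_nlp

data = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])

# Train and tokenize with the same lowercase setting; both now default to False.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    data,
    vocabulary_size=20,
    lowercase=True,
)
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,
)
print(tokenizer(["The quick brown fox jumped."]))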
1 change: 1 addition & 0 deletions keras_nlp/tokenizers/word_piece_tokenizer_trainer_test.py
@@ -150,6 +150,7 @@ def test_output_file(self):
test_text,
8,
vocab_file,
+ lowercase=True,
reserved_tokens=[],
)
vocab_from_file = []
