
Commit

Update default to strip_accents=False (#289)
* update default to

* style fixes

* updated lowercase default

* lowercase tokenizer trainer default

* style fixes

* updated default

* Fixes for WordPieceTrainer (#293)

* Changed to temp_dir

* Addressed Review Comments

* Addressed Reviews

* Edits

Co-authored-by: Matt Watson <[email protected]>

* fixed rebase

* style fixes

Co-authored-by: Aflah <[email protected]>
Co-authored-by: Matt Watson <[email protected]>
3 people authored Aug 11, 2022
1 parent 81878ca commit fc01b18
Showing 4 changed files with 36 additions and 17 deletions.
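
Before the per-file diffs, a minimal sketch of what the new defaults mean for callers, based on the docstring example changed below. The vocabulary and expected output are taken from that docstring; anything beyond it (such as exact behavior for non-lowercased input under the new defaults) should be treated as illustrative.

import keras_nlp

vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]

# With the new defaults (lowercase=False, strip_accents=False), input text is
# matched against the vocabulary as-is. To keep the old behavior, opt in:
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,      # previously the default
    strip_accents=True,  # previously the default
)
print(tokenizer(["The quick brown fox."]))  # [[1, 2, 3, 4, 5, 6, 7]] per the docstring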
36 changes: 26 additions & 10 deletions keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -148,9 +148,9 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
plain text file containing a single WordPiece token per line.
sequence_length: int. If set, the output will be converted to a dense
tensor and padded/trimmed so all outputs are of sequence_length.
- lowercase: bool, defaults to `True`. If true, the input text will be
+ lowercase: bool, defaults to `False`. If true, the input text will be
lowercased before tokenization.
- strip_accents: bool, defaults to `True`. If true, all accent marks will
+ strip_accents: bool, defaults to `False`. If true, all accent marks will
be removed from text before tokenization.
split: bool, defaults to `True`. If true, input will be split on
whitespace and punctuation marks, and all punctuation marks will be
@@ -172,15 +172,21 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
Ragged outputs.
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = ["The quick brown fox."]
- >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)
+ >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+ ... vocabulary=vocab,
+ ... lowercase=True,
+ ... )
>>> tokenizer(inputs)
<tf.RaggedTensor [[1, 2, 3, 4, 5, 6, 7]]>
Dense outputs.
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = ["The quick brown fox."]
>>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
- ... vocabulary=vocab, sequence_length=10)
+ ... vocabulary=vocab,
+ ... sequence_length=10,
+ ... lowercase=True,
+ ... )
>>> tokenizer(inputs)
<tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[1, 2, 3, 4, 5, 6, 7, 0, 0, 0]], dtype=int32)>
@@ -189,22 +195,32 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = ["The quick brown fox."]
>>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
- ... vocabulary=vocab, dtype="string")
+ ... vocabulary=vocab,
+ ... lowercase=True,
+ ... dtype="string",
+ ... )
>>> tokenizer(inputs)
<tf.RaggedTensor [[b'the', b'qu', b'##ick', b'br', b'##own', b'fox', b'.']]>
Detokenization.
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = "The quick brown fox."
- >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)
+ >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+ ... vocabulary=vocab,
+ ... lowercase=True,
+ ... )
>>> tokenizer.detokenize(tokenizer.tokenize(inputs)).numpy().decode('utf-8')
'the quick brown fox .'
Custom splitting.
>>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
>>> inputs = ["The$quick$brown$fox"]
- >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab,
- ... split=False, dtype='string')
+ >>> tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+ ... vocabulary=vocab,
+ ... split=False,
+ ... lowercase=True,
+ ... dtype='string',
+ ... )
>>> split_inputs = tf.strings.split(inputs, sep="$")
>>> tokenizer(split_inputs)
<tf.RaggedTensor [[b'the', b'qu', b'##ick', b'br', b'##own', b'fox']]>
@@ -214,8 +230,8 @@ def __init__(
self,
vocabulary=None,
sequence_length: int = None,
- lowercase: bool = True,
- strip_accents: bool = True,
+ lowercase: bool = False,
+ strip_accents: bool = False,
split: bool = True,
suffix_indicator: str = "##",
oov_token: str = "[UNK]",
6 changes: 4 additions & 2 deletions keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -89,7 +89,7 @@ def test_special_tokens(self):
def test_lowercase(self):
input_data = ["the QUicK brOWN FOX"]
vocab_data = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"]
- tokenizer = WordPieceTokenizer(vocabulary=vocab_data)
+ tokenizer = WordPieceTokenizer(vocabulary=vocab_data, lowercase=True)
call_output = tokenizer(input_data)
self.assertAllEqual(call_output, [[1, 2, 3, 4, 5, 6]])

@@ -103,7 +103,9 @@ def test_skip_lowercase(self):
def test_strip_accents(self):
input_data = ["á é í ó ú"]
vocab_data = ["[UNK]", "a", "e", "i", "o", "u"]
- tokenizer = WordPieceTokenizer(vocabulary=vocab_data)
+ tokenizer = WordPieceTokenizer(
+ vocabulary=vocab_data, strip_accents=True
+ )
call_output = tokenizer(input_data)
self.assertAllEqual(call_output, [[1, 2, 3, 4, 5]])

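
The accent-stripping test above follows the same opt-in pattern. As a rough sketch of the user-facing call, reusing the vocabulary and input from that test; without strip_accents=True, the accented characters are looked up as-is and would not match this all-ASCII vocabulary.

import keras_nlp

vocab = ["[UNK]", "a", "e", "i", "o", "u"]
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    strip_accents=True,  # accent stripping is now opt-in, not the default
)
print(tokenizer(["á é í ó ú"]))  # [[1, 2, 3, 4, 5]] per the updated test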
10 changes: 5 additions & 5 deletions keras_nlp/tokenizers/word_piece_tokenizer_trainer.py
@@ -25,8 +25,8 @@ def compute_word_piece_vocabulary(
data,
vocabulary_size,
vocabulary_output_file=None,
- lowercase=True,
- strip_accents=True,
+ lowercase=False,
+ strip_accents=False,
split=True,
suffix_indicator="##",
reserved_tokens=["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]"],
@@ -45,9 +45,9 @@ def compute_word_piece_vocabulary(
vocabulary_size: int. The maximum size of a vocabulary to be trained.
vocabulary_output_file: str, defaults to `None`. The location to write a
vocabulary file.
- lowercase: bool, defaults to `True`. If true, the input text will be
+ lowercase: bool, defaults to `False`. If true, the input text will be
lowercased before tokenization.
- strip_accents: bool, defaults to `True`. If true, all accent marks will
+ strip_accents: bool, defaults to `False`. If true, all accent marks will
be removed from text before tokenization.
split: bool, defaults to `True`. If true, input will be split on
whitespace and punctuation marks, and all punctuation marks will be
@@ -92,7 +92,7 @@ def compute_word_piece_vocabulary(
>>> inputs = tf.data.Dataset.from_tensor_slices(["bat sat: pat mat rat.\n"])
>>> split_inputs = inputs.map(normalize_and_split)
>>> vocab = compute_word_piece_vocabulary(
- ... split_inputs, 13, split=False
+ ... split_inputs, 13, split=False,
... )
>>> vocab
['[PAD]', '[CLS]', '[SEP]', '[UNK]', '[MASK]', 'a', 'b', 'm', 'p', 'r', 's', 't', '##at']
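
Because the trainer defaults changed in lockstep with the tokenizer, a vocabulary should be trained and consumed with matching flags. A hypothetical end-to-end sketch, assuming the trainer is exported as keras_nlp.tokenizers.compute_word_piece_vocabulary; the input text and vocabulary size here are illustrative.

import tensorflow as tf
import keras_nlp

data = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])

# Train and tokenize with the same lowercase setting; both now default to False.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    data,
    vocabulary_size=20,
    lowercase=True,
)
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,
)
print(tokenizer(["The quick brown fox jumped."]))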
1 change: 1 addition & 0 deletions keras_nlp/tokenizers/word_piece_tokenizer_trainer_test.py
@@ -150,6 +150,7 @@ def test_output_file(self):
test_text,
8,
vocab_file,
+ lowercase=True,
reserved_tokens=[],
)
vocab_from_file = []
