Remove the use of SentencePieceTrainer from tests (#1283)
* Remove SentencePieceTrainer from keras_nlp/models/albert

* Remove SentencePieceTrainer from keras_nlp/models/deberta_v3

* Remove SentencePieceTrainer from keras_nlp/models/f_net

* Remove SentencePieceTrainer from keras_nlp/models/t5

* Remove SentencePieceTrainer from keras_nlp/models/xlm_roberta

* Remove the .absolute() calls

* Make the bad sentencepiece proto common between all the tests

* Factor out the remaining instances

* Address review comments

- Use one proto per model; modify tests accordingly
- Add a comment saying where the test proto file was generated from
- Rename the files from `*_sentencepiece.proto` to `*_test_vocab.spm`
- Rename the bad proto file to `no_special_token_vocab.spm`
- Add a method to get the test dir
- Remove the underscores from the sentencepiece util file
- Save the file in `train_sentencepiece` function itself
- Address the XLM Roberta test failure

* create_bad_proto.py -> create_no_special_token_proto.py

* Update the SentencePieceTokenizer test proto file

* Use os.path.join and resolve XLMRoberta failures

* Fix T5 Tokenizer test failures

* Fix a merge artifact
tirthasheshpatel authored Oct 26, 2023
1 parent bbb4b1e commit d254b02
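Note: the test vocabularies referenced in the diffs below are checked-in fixtures rather than artifacts trained at test time. A minimal sketch of what a generation script like create_albert_test_proto.py could look like, assuming it simply relocates the trainer call that the ALBERT tests used to run inline (the output path and script layout here are assumptions, not taken from this commit):

import io
import os

import sentencepiece

# Same SentencePiece training setup the ALBERT tests previously ran in setUp().
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(["the quick brown fox", "the earth is round"]),
    model_writer=bytes_io,
    vocab_size=12,
    model_type="WORD",
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    pad_piece="<pad>",
    unk_piece="<unk>",
    bos_piece="[CLS]",
    eos_piece="[SEP]",
    user_defined_symbols="[MASK]",
)
# Assumed destination; the tests locate the file via self.get_test_data_dir().
output_path = os.path.join("keras_nlp", "tests", "test_data", "albert_test_vocab.spm")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "wb") as f:
    f.write(bytes_io.getvalue())

The DeBERTaV3 and XLM-RoBERTa scripts would follow the same pattern with their own special-token ids and pieces, as the per-model diffs below show.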
Showing 39 changed files with 467 additions and 462 deletions.
29 changes: 8 additions & 21 deletions keras_nlp/models/albert/albert_classifier_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_classifier import AlbertClassifier
@@ -27,26 +26,14 @@
 class AlbertClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
-            sequence_length=5,
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                ),
+                sequence_length=5,
+            )
         )
         self.backbone = AlbertBackbone(
             vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),

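The get_test_data_dir() helper called above is added elsewhere in this commit (per the commit message bullet "Add a method to get the test dir") and is not among the files shown here. Judging from the usage, it simply resolves the directory holding these .spm fixtures; a rough sketch under that assumption, written as a plain function for illustration rather than the actual TestCase method:

import os

def get_test_data_dir():
    # Assumption: the checked-in *.spm fixtures live in a test_data directory
    # next to the shared test utilities (e.g. keras_nlp/tests/test_data).
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data")
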
25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
     AlbertMaskedLMPreprocessor,
@@ -26,24 +25,12 @@

 class AlbertMaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.

28 changes: 8 additions & 20 deletions keras_nlp/models/albert/albert_masked_lm_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
@@ -29,25 +28,14 @@
 class AlbertMaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertMaskedLMPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                ),
+                sequence_length=5,
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,

25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
@@ -24,24 +23,12 @@

 class AlbertPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             "sequence_length": 8,

43 changes: 13 additions & 30 deletions keras_nlp/models/albert/albert_tokenizer_test.py
@@ -12,35 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
 from keras_nlp.tests.test_case import TestCase


 class AlbertTokenizerTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
-        self.init_kwargs = {"proto": bytes_io.getvalue()}
+        self.init_kwargs = {
+            # Generated using create_albert_test_proto.py
+            "proto": os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
+        }
         self.input_data = ["the quick brown fox.", "the earth is round."]

     def test_tokenizer_basics(self):
@@ -52,17 +39,13 @@ def test_tokenizer_basics(self):
         )

     def test_errors_missing_special_tokens(self):
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(["abc"]),
-            model_writer=bytes_io,
-            vocab_size=5,
-            pad_id=-1,
-            eos_id=-1,
-            bos_id=-1,
-        )
         with self.assertRaises(ValueError):
-            AlbertTokenizer(proto=bytes_io.getvalue())
+            AlbertTokenizer(
+                # Generated using create_no_special_token_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "no_special_token_vocab.spm"
+                )
+            )

     @pytest.mark.large
     def test_smallest_preset(self):

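The no_special_token_vocab.spm fixture used by test_errors_missing_special_tokens above would be produced the same way as the model vocabularies. A short sketch of what create_no_special_token_proto.py might contain, assuming it just saves the deliberately incomplete vocabulary the test used to train inline (the output path is an assumption):

import io
import os

import sentencepiece

# Train a tiny vocab with no pad/bos/eos pieces so tokenizers that require
# special tokens raise a ValueError when loading it.
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(["abc"]),
    model_writer=bytes_io,
    vocab_size=5,
    pad_id=-1,
    eos_id=-1,
    bos_id=-1,
)
# Assumed destination; the tests locate the file via self.get_test_data_dir().
output_path = os.path.join("keras_nlp", "tests", "test_data", "no_special_token_vocab.spm")
with open(output_path, "wb") as f:
    f.write(bytes_io.getvalue())
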
27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
 from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
@@ -31,25 +30,13 @@
 class DebertaV3ClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = DebertaV3Preprocessor(
-            DebertaV3Tokenizer(proto=bytes_io.getvalue()),
+            DebertaV3Tokenizer(
+                # Generated using create_deberta_v3_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+                )
+            ),
             sequence_length=5,
         )
         self.backbone = DebertaV3Backbone(

keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import (
     DebertaV3MaskedLMPreprocessor,
@@ -26,24 +25,12 @@

 class DebertaV3MaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = DebertaV3Tokenizer(
+            # Generated using create_deberta_v3_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+            )
         )
-        self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.

27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
 from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM
@@ -29,25 +28,13 @@
 class DebertaV3MaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = DebertaV3MaskedLMPreprocessor(
-            DebertaV3Tokenizer(proto=bytes_io.getvalue()),
+            DebertaV3Tokenizer(
+                # Generated using create_deberta_v3_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+                )
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,
