Remove the use of SentencePieceTrainer from tests (#1283)
* Remove SentencePieceTrainer from keras_nlp/models/albert

* Remove SentencePieceTrainer from keras_nlp/models/deberta_v3

* Remove SentencePieceTrainer from keras_nlp/models/f_net

* Remove SentencePieceTrainer from keras_nlp/models/t5

* Remove SentencePieceTrainer from keras_nlp/models/xlm_roberta

* Remove the .absolute() calls

* Make the bad sentencepiece proto common between all the tests

* Factor out the remaining instances

* Address review comments

- Use one proto per model; modify tests accordingly
- Add a comment saying where the test proto file was generated from
- Rename the files from `*_sentencepiece.proto` to `*_test_vocab.spm`
- Rename the bad proto file to `no_special_token_vocab.spm`
- Add a method to get the test dir
- Remove the underscores from the sentencepiece util file
- Save the file in `train_sentencepiece` function itself
- Address the XLM Roberta test failure

* create_bad_proto.py -> create_no_special_token_proto.py

* Update the SentencePieceTokenizer test proto file

* Use os.path.join and resolve XLMRoberta failures

* Fix T5 Tokenizer test failures

* Fix a merge artifact
tirthasheshpatel authored Oct 26, 2023
1 parent bbb4b1e commit d254b02
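Note: the test vocabularies referenced in the diffs below are checked-in fixtures rather than artifacts trained at test time. A minimal sketch of what a generation script like create_albert_test_proto.py could look like, assuming it simply relocates the trainer call that the ALBERT tests used to run inline (the output path and script layout here are assumptions, not taken from this commit):

import io
import os

import sentencepiece

# Same SentencePiece training setup the ALBERT tests previously ran in setUp().
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(["the quick brown fox", "the earth is round"]),
    model_writer=bytes_io,
    vocab_size=12,
    model_type="WORD",
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    pad_piece="<pad>",
    unk_piece="<unk>",
    bos_piece="[CLS]",
    eos_piece="[SEP]",
    user_defined_symbols="[MASK]",
)
# Assumed destination; the tests locate the file via self.get_test_data_dir().
output_path = os.path.join("keras_nlp", "tests", "test_data", "albert_test_vocab.spm")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "wb") as f:
    f.write(bytes_io.getvalue())

The DeBERTaV3 and XLM-RoBERTa scripts would follow the same pattern with their own special-token ids and pieces, as the per-model diffs below show.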
Showing 39 changed files with 467 additions and 462 deletions.
29 changes: 8 additions & 21 deletions keras_nlp/models/albert/albert_classifier_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_classifier import AlbertClassifier
@@ -27,26 +26,14 @@
 class AlbertClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
-            sequence_length=5,
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                ),
+                sequence_length=5,
+            )
         )
         self.backbone = AlbertBackbone(
             vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),

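The get_test_data_dir() helper called above is added elsewhere in this commit (per the commit message bullet "Add a method to get the test dir") and is not among the files shown here. Judging from the usage, it simply resolves the directory holding these .spm fixtures; a rough sketch under that assumption, written as a plain function for illustration rather than the actual TestCase method:

import os

def get_test_data_dir():
    # Assumption: the checked-in *.spm fixtures live in a test_data directory
    # next to the shared test utilities (e.g. keras_nlp/tests/test_data).
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data")
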
25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
     AlbertMaskedLMPreprocessor,
@@ -26,24 +25,12 @@

 class AlbertMaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.

28 changes: 8 additions & 20 deletions keras_nlp/models/albert/albert_masked_lm_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
@@ -29,25 +28,14 @@
 class AlbertMaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertMaskedLMPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                ),
+                sequence_length=5,
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,

25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
@@ -24,24 +23,12 @@

 class AlbertPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             "sequence_length": 8,

43 changes: 13 additions & 30 deletions keras_nlp/models/albert/albert_tokenizer_test.py
@@ -12,35 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
 from keras_nlp.tests.test_case import TestCase


 class AlbertTokenizerTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
-        self.init_kwargs = {"proto": bytes_io.getvalue()}
+        self.init_kwargs = {
+            # Generated using create_albert_test_proto.py
+            "proto": os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
+        }
         self.input_data = ["the quick brown fox.", "the earth is round."]

     def test_tokenizer_basics(self):
@@ -52,17 +39,13 @@ def test_tokenizer_basics(self):
         )

     def test_errors_missing_special_tokens(self):
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(["abc"]),
-            model_writer=bytes_io,
-            vocab_size=5,
-            pad_id=-1,
-            eos_id=-1,
-            bos_id=-1,
-        )
         with self.assertRaises(ValueError):
-            AlbertTokenizer(proto=bytes_io.getvalue())
+            AlbertTokenizer(
+                # Generated using create_no_special_token_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "no_special_token_vocab.spm"
+                )
+            )

     @pytest.mark.large
     def test_smallest_preset(self):

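The no_special_token_vocab.spm fixture used by test_errors_missing_special_tokens above would be produced the same way as the model vocabularies. A short sketch of what create_no_special_token_proto.py might contain, assuming it just saves the deliberately incomplete vocabulary the test used to train inline (the output path is an assumption):

import io
import os

import sentencepiece

# Train a tiny vocab with no pad/bos/eos pieces so tokenizers that require
# special tokens raise a ValueError when loading it.
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(["abc"]),
    model_writer=bytes_io,
    vocab_size=5,
    pad_id=-1,
    eos_id=-1,
    bos_id=-1,
)
# Assumed destination; the tests locate the file via self.get_test_data_dir().
output_path = os.path.join("keras_nlp", "tests", "test_data", "no_special_token_vocab.spm")
with open(output_path, "wb") as f:
    f.write(bytes_io.getvalue())
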
27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
 from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
@@ -31,25 +30,13 @@
 class DebertaV3ClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = DebertaV3Preprocessor(
-            DebertaV3Tokenizer(proto=bytes_io.getvalue()),
+            DebertaV3Tokenizer(
+                # Generated using create_deberta_v3_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+                )
+            ),
             sequence_length=5,
         )
         self.backbone = DebertaV3Backbone(

keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import (
     DebertaV3MaskedLMPreprocessor,
@@ -26,24 +25,12 @@

 class DebertaV3MaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = DebertaV3Tokenizer(
+            # Generated using create_deberta_v3_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+            )
         )
-        self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.

27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
 from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM
@@ -29,25 +28,13 @@
 class DebertaV3MaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = DebertaV3MaskedLMPreprocessor(
-            DebertaV3Tokenizer(proto=bytes_io.getvalue()),
+            DebertaV3Tokenizer(
+                # Generated using create_deberta_v3_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+                )
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,
