
Handle [MASK] token in DebertaV3Tokenizer #759

Merged: 15 commits, merged on Feb 24, 2023
4 changes: 3 additions & 1 deletion keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -52,7 +52,9 @@ def setUp(self):
unk_piece="[UNK]",
)
self.preprocessor = DebertaV3Preprocessor(
tokenizer=DebertaV3Tokenizer(proto=bytes_io.getvalue()),
tokenizer=DebertaV3Tokenizer(
proto=bytes_io.getvalue(), mask_token_id=10
),
sequence_length=12,
)
self.backbone = DebertaV3Backbone(
@@ -51,7 +51,7 @@ def setUp(self):
self.proto = bytes_io.getvalue()

self.preprocessor = DebertaV3Preprocessor(
tokenizer=DebertaV3Tokenizer(proto=self.proto),
tokenizer=DebertaV3Tokenizer(proto=self.proto, mask_token_id=10),
sequence_length=12,
)

20 changes: 15 additions & 5 deletions keras_nlp/models/deberta_v3/deberta_v3_presets.py
@@ -25,7 +25,9 @@
"max_sequence_length": 512,
"bucket_size": 256,
},
"preprocessor_config": {},
"preprocessor_config": {
"mask_token_id": 128000,
Member: I think we need to remove this from all the presets now, right? It is breaking tests.

},
"metadata": {
"description": (
"12-layer DeBERTaV3 model where case is maintained. "
@@ -51,7 +53,9 @@
"max_sequence_length": 512,
"bucket_size": 256,
},
"preprocessor_config": {},
"preprocessor_config": {
"mask_token_id": 128000,
},
"metadata": {
"description": (
"6-layer DeBERTaV3 model where case is maintained. "
@@ -77,7 +81,9 @@
"max_sequence_length": 512,
"bucket_size": 256,
},
"preprocessor_config": {},
"preprocessor_config": {
"mask_token_id": 128000,
},
"metadata": {
"description": (
"12-layer DeBERTaV3 model where case is maintained. "
@@ -103,7 +109,9 @@
"max_sequence_length": 512,
"bucket_size": 256,
},
"preprocessor_config": {},
"preprocessor_config": {
"mask_token_id": 128000,
},
"metadata": {
"description": (
"24-layer DeBERTaV3 model where case is maintained. "
@@ -129,7 +137,9 @@
"max_sequence_length": 512,
"bucket_size": 256,
},
"preprocessor_config": {},
"preprocessor_config": {
"mask_token_id": 250101,
},
"metadata": {
"description": (
"12-layer DeBERTaV3 model where case is maintained. "
8 changes: 8 additions & 0 deletions keras_nlp/models/deberta_v3/deberta_v3_presets_test.py
@@ -53,6 +53,14 @@ def test_preprocessor_output(self):
expected_outputs = [1, 279, 1538, 2]
self.assertAllEqual(outputs, expected_outputs)

def test_preprocessor_mask_token(self):
preprocessor = DebertaV3Preprocessor.from_preset(
"deberta_v3_extra_small_en",
sequence_length=4,
)
self.assertEqual(preprocessor.tokenizer.id_to_token(128000), "[MASK]")
self.assertEqual(preprocessor.tokenizer.token_to_id("[MASK]"), 128000)

@parameterized.named_parameters(
("preset_weights", True), ("random_weights", False)
)
68 changes: 65 additions & 3 deletions keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py
@@ -48,6 +48,11 @@ class DebertaV3Tokenizer(SentencePieceTokenizer):
`bytes` object with a serialized SentencePiece proto. See the
[SentencePiece repository](https://github.com/google/sentencepiece)
for more details on the format.
mask_token_id: The token ID (int) of the mask token (`[MASK]`). If
`None`, the SentencePiece vocabulary is expected to have the mask
token. Preset DeBERTa vocabularies do not have the mask token in the
provided vocabulary files, which is why this workaround is
necessary.

Member: I think that, given most users will not need an MLM task, we should actually make this optional when "bringing your own data." Something like...

  • Use one of our presets. self.mask_token_id is set and works as expected.
  • Pass your own local copy of a DeBERTa spm file and don't set anything. self.mask_token_id is None, and everything works except DebertaMaskedLM, which throws a friendly error message. We can cover the error in #732 ("Solve #721 Deberta masklm model").
  • Optional (but already working here). Use your own custom spm file with a "[MASK]" token. self.mask_token_id is set and works as expected.

Does that make sense to you?

Author (@abheesht17, Feb 23, 2023): (1) and (3) were already taken care of. I've pushed changes which solve all three cases and resolve the other comment.

Member: Thanks! Looks good. Left some thoughts below on how we could maybe make the subclass changes a bit easier by modifying the super class.
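Below is a minimal usage sketch of the three paths the reviewer describes, written against the API in this PR (the mask_token_id constructor argument and the presets above). The file paths are hypothetical, and the second case shows the reviewer's proposed behavior rather than a guarantee about the merged code:

```python
from keras_nlp.models import DebertaV3Tokenizer

# Case 1: a preset vocabulary. mask_token_id comes from the preset's
# preprocessor_config, so "[MASK]" resolves even though the underlying
# .spm file has no mask piece.
tokenizer = DebertaV3Tokenizer.from_preset("deberta_v3_extra_small_en")
print(tokenizer.token_to_id("[MASK]"))  # 128000, per the preset config above

# Case 2: your own copy of a DeBERTa .spm file, no mask_token_id passed.
# Under the reviewer's proposal, mask_token_id stays None and only the
# masked-LM task would raise a friendly error.
tokenizer = DebertaV3Tokenizer(proto="path/to/deberta_v3.spm")  # hypothetical path

# Case 3: a custom .spm file that already contains a "[MASK]" piece.
# The id is looked up directly from the vocabulary.
tokenizer = DebertaV3Tokenizer(proto="path/to/custom_with_mask.spm")  # hypothetical path
print(tokenizer.mask_token_id)
```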

Examples:

@@ -65,15 +70,29 @@ class DebertaV3Tokenizer(SentencePieceTokenizer):
```
"""

def __init__(self, proto, **kwargs):
def __init__(self, proto, mask_token_id=None, **kwargs):
super().__init__(proto=proto, **kwargs)

# Maintain a private copy of `mask_token_id` for config purposes.
self._mask_token_id = mask_token_id

# Maintain a private copy of the original vocabulary; the parent class's
# `get_vocabulary()` function calls `self.vocabulary_size()`, which
# throws up a segmentation fault.
Member: What is the segmentation fault here? I'm not sure I totally follow. Ideally we don't have to store a copy of the vocabulary. This would be a not-totally-insignificant waste of memory!

Author (@abheesht17): Made edits!

Calling super().get_vocabulary() in __init__ causes a seg fault because SentencePieceTokenizer calls self.vocabulary_size() here: https://github.com/keras-team/keras-nlp/blob/master/keras_nlp/tokenizers/sentence_piece_tokenizer.py#L161-L165. Since we change the vocabulary_size() function in DebertaV3Tokenizer to return a value greater than the SPM vocabulary size, this causes a seg fault.
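A minimal pure-Python sketch (no SentencePiece involved; class names are illustrative) of the dispatch problem described above: the base method sizes its lookup with self.vocabulary_size(), so a subclass that reports a larger logical vocabulary makes the base method index past the real one.

```python
class BaseTokenizer:
    def __init__(self):
        self._vocab = ["[PAD]", "a", "b"]

    def vocabulary_size(self):
        return len(self._vocab)

    def get_vocabulary(self):
        # Dispatches to the subclass override of vocabulary_size().
        return [self._vocab[i] for i in range(self.vocabulary_size())]


class MaskAwareTokenizer(BaseTokenizer):
    def vocabulary_size(self):
        # Reports one extra slot for an out-of-vocabulary "[MASK]" id.
        return super().vocabulary_size() + 1


MaskAwareTokenizer().get_vocabulary()
# IndexError here in pure Python; with the C++ SentencePiece op the
# out-of-range lookup surfaces as a segmentation fault instead.
```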

self._original_vocabulary = super().get_vocabulary()

# Check for necessary special tokens.
cls_token = "[CLS]"
sep_token = "[SEP]"
pad_token = "[PAD]"
for token in [cls_token, pad_token, sep_token]:
if token not in self.get_vocabulary():
mask_token = "[MASK]"

in_vocab_special_tokens = [cls_token, pad_token, sep_token]
if mask_token_id is None:
in_vocab_special_tokens = in_vocab_special_tokens + [mask_token]

for token in in_vocab_special_tokens:
if token not in self._original_vocabulary:
raise ValueError(
f"Cannot find token `'{token}'` in the provided "
f"`vocabulary`. Please provide `'{token}'` in your "
Expand All @@ -83,6 +102,49 @@ def __init__(self, proto, **kwargs):
self.cls_token_id = self.token_to_id(cls_token)
self.sep_token_id = self.token_to_id(sep_token)
self.pad_token_id = self.token_to_id(pad_token)
self.mask_token_id = mask_token_id
if mask_token_id is None:
self.mask_token_id = self.token_to_id(mask_token)

def vocabulary_size(self):
vocabulary_size = super().vocabulary_size()

# This is to avoid an error when `super().get_vocabulary()` is called
# in `__init__()`.
if not hasattr(self, "mask_token_id"):
return vocabulary_size

if self.mask_token_id >= vocabulary_size:
return self.mask_token_id + 1
return vocabulary_size

def get_vocabulary(self):
vocabulary = self._original_vocabulary
if self.mask_token_id >= len(vocabulary):
vocabulary = vocabulary + [None] * (
self.mask_token_id - len(vocabulary) + 1
)
vocabulary[self.mask_token_id] = "[MASK]"
return vocabulary

def id_to_token(self, id):
if id == self.mask_token_id:
return "[MASK]"
return super().id_to_token(id)

def token_to_id(self, token):
if token == "[MASK]":
return self.mask_token_id
return int(self._sentence_piece.string_to_id(token).numpy())

def get_config(self):
config = super().get_config()
config.update(
{
"mask_token_id": self._mask_token_id,
}
)
return config

@classproperty
def presets(cls):
13 changes: 11 additions & 2 deletions keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py
@@ -47,7 +47,7 @@ def setUp(self):
)
self.proto = bytes_io.getvalue()

self.tokenizer = DebertaV3Tokenizer(proto=self.proto)
self.tokenizer = DebertaV3Tokenizer(proto=self.proto, mask_token_id=10)

def test_tokenize(self):
input_data = "the quick brown fox"
@@ -65,7 +65,16 @@ def test_detokenize(self):
self.assertEqual(output, tf.constant(["the quick brown fox"]))

def test_vocabulary_size(self):
self.assertEqual(self.tokenizer.vocabulary_size(), 10)
self.assertEqual(self.tokenizer.vocabulary_size(), 11)

def test_get_vocabulary(self):
self.assertEqual(self.tokenizer.get_vocabulary()[10], "[MASK]")

def test_id_to_token(self):
self.assertEqual(self.tokenizer.id_to_token(10), "[MASK]")

def test_token_to_id(self):
self.assertEqual(self.tokenizer.token_to_id("[MASK]"), 10)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
2 changes: 1 addition & 1 deletion keras_nlp/tokenizers/sentence_piece_tokenizer.py
@@ -159,7 +159,7 @@ def vocabulary_size(self) -> int:
return int(self._sentence_piece.vocab_size().numpy())

def get_vocabulary(self) -> List[str]:
"""Get the size of the tokenizer vocabulary."""
"""Get the tokenizer vocabulary."""
Member: Is there any downside to making the super class impl use self._sentence_piece.vocab_size() instead of self.vocabulary_size() here? Then we don't need all this indirection on the subclass.

return tensor_to_string_list(
self._sentence_piece.id_to_string(tf.range(self.vocabulary_size()))
)
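A sketch of the reviewer's suggestion above, as an assumption about a possible follow-up rather than a change made in this PR: size the lookup with the underlying SentencePiece op directly, so a subclass may report a larger logical vocabulary_size() without affecting this method.

```python
# Hypothetical revision of SentencePieceTokenizer.get_vocabulary().
def get_vocabulary(self) -> List[str]:
    """Get the tokenizer vocabulary."""
    # Use the raw SentencePiece vocab size rather than self.vocabulary_size(),
    # which subclasses such as DebertaV3Tokenizer override to a larger value.
    return tensor_to_string_list(
        self._sentence_piece.id_to_string(
            tf.range(int(self._sentence_piece.vocab_size().numpy()))
        )
    )
```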