Enable option for subword regularization in XLMRobertaTokenizer (#11149)

* enable subword regularization.

* fix tokenizer storage

* fix docstring formatting

* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py

Co-authored-by: Stefan Schweter <[email protected]>

* fix docstring formatting

* add test for subword regularization tokenizer

* improve comments of test

* add sp_model_kwargs

* reformat docstring to match the style

* add some more documentation

* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py

Co-authored-by: Sylvain Gugger <[email protected]>

* improve docstring

* empty commit to trigger CI

* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py

Co-authored-by: Sylvain Gugger <[email protected]>

* fix docstring formatting for sphinx

Co-authored-by: Stefan Schweter <[email protected]>
Co-authored-by: Sylvain Gugger <[email protected]>
3 people authored Apr 23, 2021
1 parent 1ef152e commit 195bfd1
Showing 2 changed files with 44 additions and 2 deletions.
22 changes: 20 additions & 2 deletions src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
@@ -94,6 +94,20 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`, defaults to :obj:`None`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
  - ``nbest_size = {0,1}``: No sampling is performed.
  - ``nbest_size > 1``: samples from the nbest_size results.
  - ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
    using the forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
  BPE-dropout.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
@@ -115,11 +129,14 @@ def __init__(
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
sp_model_kwargs=None,
**kwargs
):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -128,10 +145,11 @@
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=sp_model_kwargs,
**kwargs,
)

self.sp_model = spm.SentencePieceProcessor()
self.sp_model = spm.SentencePieceProcessor(**sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file

@@ -249,7 +267,7 @@ def get_vocab(self):
return vocab

def _tokenize(self, text):
return self.sp_model.EncodeAsPieces(text)
return self.sp_model.encode(text, out_type=str)

def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
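Taken together, these changes thread ``sp_model_kwargs`` from the tokenizer into SentencePiece. A minimal usage sketch (not part of the commit; the model path is a placeholder, and the sampling settings mirror the new test below):

    from transformers import XLMRobertaTokenizer

    # Placeholder path: any trained SentencePiece model file works here.
    tokenizer = XLMRobertaTokenizer(
        "sentencepiece.bpe.model",
        sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1},
    )

    # With sampling enabled, repeated calls may segment the same text differently.
    print(tokenizer.tokenize("This is a test for subword regularization."))
    print(tokenizer.tokenize("This is a test for subword regularization."))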
24 changes: 24 additions & 0 deletions tests/test_tokenization_xlm_roberta.py
@@ -14,6 +14,7 @@
# limitations under the License.


import itertools
import os
import unittest

@@ -118,6 +119,29 @@ def test_full_tokenizer(self):
],
)

def test_subword_regularization_tokenizer(self):
# Subword regularization is only available for the slow tokenizer.
tokenizer = XLMRobertaTokenizer(
SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
)

# Subword regularization augments training data with subword sampling.
# This has a random component. We test if the tokenizer generates different
# results when subword regularization is enabled.
tokens_list = []
for _ in range(5):
tokens_list.append(tokenizer.tokenize("This is a test for subword regularization."))

# All unordered pairs of tokenization results.
combinations = itertools.combinations(tokens_list, 2)

all_equal = True
for combination in combinations:
if combination[0] != combination[1]:
all_equal = False

self.assertFalse(all_equal)

@cached_property
def big_tokenizer(self):
return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
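For reference, a sketch of what ``sp_model_kwargs`` forwards to SentencePiece itself, using only the plain ``sentencepiece`` API (model path again a placeholder). The diff's move from ``EncodeAsPieces`` to ``encode(text, out_type=str)`` matches this call form:

    import sentencepiece as spm

    # The dict passed as sp_model_kwargs lands in this constructor.
    sp = spm.SentencePieceProcessor(enable_sampling=True, alpha=0.1, nbest_size=-1)
    sp.Load("sentencepiece.bpe.model")  # placeholder model path

    # Two calls can return different segmentations while sampling is enabled.
    print(sp.encode("This is a test for subword regularization.", out_type=str))
    print(sp.encode("This is a test for subword regularization.", out_type=str))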
