huggingface · ArthurZucker · Sep 18, 2023 · Jun 2, 2023 · Jun 2, 2023 · Jun 2, 2023
diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -33,6 +33,15 @@
 logger = logging.get_logger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
 
+PRETRAINED_VOCAB_FILES_MAP = {  # per VOCAB_FILES_NAMES key: checkpoint name -> URL of that file
+    "vocab_file": {
+        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
+    },
+    "tokenizer_file": {
+        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.json",
+    },
+}
+
 
 class LlamaTokenizerFast(PreTrainedTokenizerFast):
     """
@@ -75,6 +84,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     slow_tokenizer_class = LlamaTokenizer
     padding_side = "left"
 

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
@@ -408,10 +408,10 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
         # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
         model.resize_token_embeddings(len(tokenizer))
         ```"""
-        new_tokens = [str(tok) for tok in new_tokens]
+        token_contents = [str(tok) for tok in new_tokens]
 
         tokens_to_add = []
-        for token in new_tokens:
+        for i, token in enumerate(token_contents):
             if not isinstance(token, str):
                 raise TypeError(f"Token {token} is not a string but a {type(token)}.")
             if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
@@ -422,6 +422,9 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
                 and token not in tokens_to_add
             ):
                 tokens_to_add.append(token)
+                if isinstance(new_tokens[i], AddedToken) or special_tokens:
+                    # Tokens passed as `AddedToken`, or added with `special_tokens=True`, are kept as special tokens.
+                    self._additional_special_tokens.append(new_tokens[i])
                 if self.verbose:
                     logger.info(f"Adding {token} to the vocabulary")
 
@@ -430,12 +433,12 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
         self.added_tokens_encoder.update(added_tok_encoder)
         self.added_tokens_decoder.update(added_tok_decoder)
 
-        # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
+        # Make sure we don't split on any special tokens (even if they were already in the vocab before e.g. for Albert)
         if special_tokens:
-            if len(new_tokens) == 1:
-                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0])
+            if len(token_contents) == 1:
+                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, token_contents[0])
             else:
-                self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
+                self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(token_contents)))
         else:
             # Or on the newly added tokens
             if len(tokens_to_add) == 1:
@@ -530,6 +533,10 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
                     if tok_extended.lstrip and left:
                         tokens[i - 1] = left.rstrip()  # Opposite here
                 else:
+                    # There should be a list of additional tokens that are not special: they must never be
+                    # split, yet they are not special tokens. By default, any added token apparently strips on
+                    # both its left and right sides, and we need to keep this behaviour.
+
                     # We strip left and right by default
                     if right:
                         tokens[i + 1] = right.lstrip()

diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
@@ -51,7 +51,10 @@
 @require_tokenizers
 class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = LlamaTokenizer
-    test_rust_tokenizer = False
+    rust_tokenizer_class = LlamaTokenizerFast
+
+    # FIXME: testing the rust tokenizer does not work yet; re-enable the line below once it is supported
+    # test_rust_tokenizer = True
     test_sentencepiece = True
     from_pretrained_kwargs = {}
 

diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
@@ -2110,6 +2110,71 @@ def test_batch_encode_plus_batch_sequence_length(self):
                         encoded_sequences_batch_padded_2[key],
                     )
 
+    def test_added_token_are_never_split(self):
+        """
+        Check that tokens registered through `add_tokens` come out of `tokenize` in one piece, and that a space
+        next to them is kept or dropped according to their `lstrip` (left side) and `rstrip` (right side) flags.
+        """
+        if not self.test_slow_tokenizer:
+            self.skipTest("Currently this test is only for slow tokenizers")
+
+        model_ids = self.tokenizer_class.pretrained_vocab_files_map[self.from_pretrained_vocab_key].keys()
+        tokenizer = self.tokenizer_class.from_pretrained(list(model_ids)[0])
+        new_tokens = [
+            AddedToken("<lstrip=False, rstrip=False>", lstrip=False, rstrip=False),
+            AddedToken("<lstrip=True, rstrip=False>", lstrip=True, rstrip=False),
+            AddedToken("<lstrip=False, rstrip=True>", lstrip=False, rstrip=True),
+            AddedToken("<lstrip=True, rstrip=True>", lstrip=True, rstrip=True),
+        ]
+
+        # Piece produced for a lone space; the added tokens contain no bare space, so it is computed once.
+        space = tokenizer.tokenize(" ")[0]
+        if len(space) > 1:
+            # BPE adds a spiece underline
+            space = space[-1]
+
+        for token in new_tokens:
+            with self.subTest(f"testing with {token.content[1:-1]}"):
+                tokenizer.add_tokens([token])
+                content = token.content
+
+                # No surrounding space: the token only has to survive as a single piece.
+                tokens = tokenizer.tokenize(f"This sentence is{content}a test")
+                self.assertIn(content, tokens)
+
+                # Space on the left only: `lstrip` decides whether it survives in the previous piece.
+                tokens = tokenizer.tokenize(f"This sentence is {content}a test")
+                self.assertIn(content, tokens)
+                idx = tokens.index(content)
+                if token.lstrip:
+                    self.assertNotIn(space, tokens[idx - 1])
+                else:
+                    self.assertIn(space, tokens[idx - 1])
+
+                # Space on the right only: `rstrip` decides whether it survives in the next piece.
+                tokens = tokenizer.tokenize(f"This sentence is{content} a test")
+                self.assertIn(content, tokens)
+                idx = tokens.index(content)
+                if token.rstrip:
+                    self.assertNotIn(space, tokens[idx + 1])
+                else:
+                    self.assertIn(space, tokens[idx + 1])
+
+                # Space on both sides: each flag is checked against its own neighbour.
+                tokens = tokenizer.tokenize(f"This sentence is {content} a test")
+                self.assertIn(content, tokens)
+                idx = tokens.index(content)
+                if token.lstrip:
+                    self.assertNotIn(space, tokens[idx - 1])
+                else:
+                    self.assertIn(space, tokens[idx - 1])
+                if token.rstrip:
+                    self.assertNotIn(space, tokens[idx + 1])
+                else:
+                    self.assertIn(space, tokens[idx + 1])
+
+        # TODO: for non-BPE based tokenizers we still need to test that lstrip and rstrip are respected
+
     @require_tokenizers
     def test_added_token_are_matched_longest_first(self):
         if not self.test_slow_tokenizer: