GH-1492: truncation to 512 subtokens in TransformerDocumentEmbeddings
alanakbik committed Mar 30, 2020
1 parent 5f118a7 commit 03ed8c5
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion flair/embeddings.py
@@ -2483,7 +2483,8 @@ def _add_embeddings_to_sentences(self, sentences: List[Sentence]):

         # subtokenize sentences
         for sentence in sentences:
-            subtokenized_sentence = self.tokenizer.encode(sentence.to_tokenized_string(), add_special_tokens=True)
+            # tokenize and truncate to 512 subtokens (TODO: check better truncation strategies)
+            subtokenized_sentence = self.tokenizer.encode(sentence.to_tokenized_string(), add_special_tokens=True)[:512]
             subtokenized_sentences.append(
                 torch.tensor(subtokenized_sentence, dtype=torch.long, device=flair.device))

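As the TODO in the diff acknowledges, plain [:512] slicing is the bluntest truncation strategy: for over-long inputs it also cuts off the trailing special token ([SEP] for BERT) that add_special_tokens=True appends. Below is a minimal standalone sketch, not part of this commit, of a variant that keeps the final special token; the tokenizer name and the MAX_SUBTOKENS constant are illustrative assumptions.

    from transformers import AutoTokenizer

    # hypothetical setup; flair would use the tokenizer of the loaded model
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    MAX_SUBTOKENS = 512  # matches BERT's position-embedding limit

    text = "a very long document " * 200
    subtoken_ids = tokenizer.encode(text, add_special_tokens=True)
    if len(subtoken_ids) > MAX_SUBTOKENS:
        # keep the last id (the [SEP] token for BERT) so the
        # truncated sequence still ends with a special token
        subtoken_ids = subtoken_ids[:MAX_SUBTOKENS - 1] + subtoken_ids[-1:]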
