GH-5: Added new type of embeddings: FastTextEmbeddings with documentation #879

Merged: 9 commits, Jul 16, 2019
70 changes: 70 additions & 0 deletions flair/embeddings.py
@@ -345,6 +345,76 @@ def extra_repr(self):
        return f"'{self.embeddings}'"


class FastTextEmbeddings(TokenEmbeddings):
    """FastText embeddings with out-of-vocabulary (OOV) functionality."""

    def __init__(self, embeddings: str, use_local: bool = True, field: str = None):
        """
        Initializes FastText word embeddings. If use_local is False, the constructor downloads the
        required embedding file and stores it in the cache directory.

        :param embeddings: path to your embeddings '.bin' file
        :param use_local: set this to False if you are using embeddings from a remote source
        :param field: if set, the value of the tag with this name is embedded instead of the token text
        """

        cache_dir = Path("embeddings")

        if use_local:
            if not Path(embeddings).exists():
                raise ValueError(
                    f'The given embeddings "{embeddings}" is not available or is not a valid path.'
                )
        else:
            embeddings = cached_path(f"{embeddings}", cache_dir=cache_dir)

        self.embeddings = embeddings

        self.name: str = str(embeddings)

        self.static_embeddings = True

        self.precomputed_word_embeddings = gensim.models.FastText.load_fasttext_format(
            str(embeddings)
        )

        self.__embedding_length: int = self.precomputed_word_embeddings.vector_size

        self.field = field
        super().__init__()

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:

        for sentence in sentences:

            for token in sentence.tokens:

                # embed either the token text or, if a field is set, the value of that tag
                if "field" not in self.__dict__ or self.field is None:
                    word = token.text
                else:
                    word = token.get_tag(self.field).value

                # FastText composes vectors for OOV words from sub-word n-grams;
                # fall back to a zero vector only if no n-gram of the word is known
                try:
                    word_embedding = self.precomputed_word_embeddings[word]
                except KeyError:
                    word_embedding = np.zeros(self.embedding_length, dtype="float")

                word_embedding = torch.FloatTensor(word_embedding)

                token.set_embedding(self.name, word_embedding)

        return sentences

    def __str__(self):
        return self.name

    def extra_repr(self):
        return f"'{self.embeddings}'"


class OneHotEmbeddings(TokenEmbeddings):
"""One-hot encoded embeddings."""

2 changes: 1 addition & 1 deletion requirements.txt
@@ -14,4 +14,4 @@ bpemb>=0.2.9
regex
tabulate
urllib3<1.25,>=1.20
-langdetect
+langdetect
27 changes: 27 additions & 0 deletions resources/docs/TUTORIAL_3_WORD_EMBEDDING.md
@@ -128,6 +128,33 @@
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/path/to/fasttex
word_vectors.save('/path/to/converted')
```

However, FastText embeddings can also return vectors for out-of-vocabulary words by using sub-word information. If you want this functionality, use the `FastTextEmbeddings` class instead.


## FastText Embeddings

FastText embeddings can give you vectors for out-of-vocabulary (OOV) words by using sub-word information. To use this functionality with Flair, use the `FastTextEmbeddings` class as shown:

```python
from flair.data import Sentence
from flair.embeddings import FastTextEmbeddings

# init embedding
embedding = FastTextEmbeddings('/path/to/local/custom_fasttext_embeddings.bin')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
embedding.embed(sentence)
```
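
Because FastText composes word vectors from character n-grams, even unseen or misspelled words receive a meaningful vector rather than an unknown-word placeholder. A minimal sketch to verify this, reusing the `embedding` object from above ('greenish' is a hypothetical OOV word chosen for illustration):

```python
# embed a sentence containing a word that is unlikely to be in the vocabulary
sentence = Sentence('The grass is greenish .')
embedding.embed(sentence)

# every token, including the OOV word, now carries a vector of embedding_length
for token in sentence:
    print(token.text, token.embedding.shape)
```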

You can also initialize the class with a remote downloadable URL: set `use_local=False` and the file is downloaded once and stored in Flair's `embeddings` cache directory.

```python
embedding = FastTextEmbeddings('/path/to/remote/downloadable/custom_fasttext_embeddings.bin', use_local=False)
```
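
Like the other token-level embeddings in this tutorial, `FastTextEmbeddings` can be combined with different embedding types via `StackedEmbeddings`. A brief sketch, under the same assumed local `.bin` path as above:

```python
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FastTextEmbeddings, StackedEmbeddings

# stack classic GloVe vectors with sub-word aware FastText vectors
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FastTextEmbeddings('/path/to/local/custom_fasttext_embeddings.bin'),
])

sentence = Sentence('The grass is green .')
stacked_embeddings.embed(sentence)
```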


## Character Embeddings

Some embeddings - such as character-features - are not pre-trained but rather trained on the downstream task. Normally
Expand Down