Commit 29b6c58

Merge pull request #879 from pranaychandekar/GH-5-fasttext-oov-embeddings

 GH-5: Added new type of embeddings: FastTextEmbeddings with documentation
yosipk authored Jul 16, 2019
2 parents 6c27431 + 8ce3cce commit 29b6c58
Showing 3 changed files with 98 additions and 1 deletion.
70 changes: 70 additions & 0 deletions flair/embeddings.py
@@ -345,6 +345,76 @@ def extra_repr(self):
        return f"'{self.embeddings}'"


class FastTextEmbeddings(TokenEmbeddings):
    """FastText embeddings with out-of-vocabulary (OOV) functionality."""

    def __init__(self, embeddings: str, use_local: bool = True, field: str = None):
        """
        Initializes FastText word embeddings. If use_local is False, the constructor downloads the
        required embedding file and stores it in the cache.
        :param embeddings: path to your embeddings '.bin' file
        :param use_local: set this to False if you are using embeddings from a remote source
        :param field: if set, embed the value of this token tag instead of the token text
        """

        cache_dir = Path("embeddings")

        if use_local:
            if not Path(embeddings).exists():
                raise ValueError(
                    f'The given embeddings "{embeddings}" is not available or is not a valid path.'
                )
        else:
            embeddings = cached_path(embeddings, cache_dir=cache_dir)

        self.embeddings = embeddings

        self.name: str = str(embeddings)

        self.static_embeddings = True

        self.precomputed_word_embeddings = gensim.models.FastText.load_fasttext_format(
            str(embeddings)
        )

        self.__embedding_length: int = self.precomputed_word_embeddings.vector_size

        self.field = field
        super().__init__()

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:

        for sentence in sentences:

            for token in sentence.tokens:

                # embed either the token text or, if a field is set, the value of that tag
                if "field" not in self.__dict__ or self.field is None:
                    word = token.text
                else:
                    word = token.get_tag(self.field).value

                # FastText composes vectors for OOV words from sub-word n-grams;
                # fall back to a zero vector only if the lookup fails entirely
                try:
                    word_embedding = self.precomputed_word_embeddings[word]
                except KeyError:
                    word_embedding = np.zeros(self.embedding_length, dtype="float")

                word_embedding = torch.FloatTensor(word_embedding)

                token.set_embedding(self.name, word_embedding)

        return sentences

    def __str__(self):
        return self.name

    def extra_repr(self):
        return f"'{self.embeddings}'"


class OneHotEmbeddings(TokenEmbeddings):
"""One-hot encoded embeddings."""

2 changes: 1 addition & 1 deletion requirements.txt
@@ -14,4 +14,4 @@ bpemb>=0.2.9
regex
tabulate
urllib3<1.25,>=1.20
langdetect
27 changes: 27 additions & 0 deletions resources/docs/TUTORIAL_3_WORD_EMBEDDING.md
@@ -128,6 +128,33 @@ word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/path/to/fasttex
word_vectors.save('/path/to/converted')
```

However, FastText embeddings can also return vectors for out-of-vocabulary words by building them from sub-word information. If you want this functionality, use the `FastTextEmbeddings` class instead.
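
For illustration, the same behavior is visible directly in gensim, which this class uses under the hood. A minimal sketch, assuming gensim 3.x and placeholder values for the path and the word:

```python
import gensim

# load a FastText model in its native .bin format, which retains sub-word n-grams
model = gensim.models.FastText.load_fasttext_format('/path/to/custom_fasttext_embeddings.bin')

# a word missing from the vocabulary still receives a vector composed from its n-grams
print('someunseenword' in model.wv.vocab)  # False for an unseen word
print(model.wv['someunseenword'].shape)    # still a full-size vector
```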


## FastText Embeddings

FastText embeddings can give you vectors for out-of-vocabulary (OOV) words by using sub-word information. To use this functionality with Flair, use the `FastTextEmbeddings` class as shown:

```python
from flair.embeddings import FastTextEmbeddings

# init embedding
embedding = FastTextEmbeddings('/path/to/local/custom_fasttext_embeddings.bin')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
embedding.embed(sentence)
```
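
As with any Flair token embeddings, you can then inspect the result token by token. A minimal sketch, assuming the embedding call above succeeded:

```python
# each token now carries its FastText vector
for token in sentence:
    print(token)
    print(token.embedding)
```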

You can also initialize the class with the URL of a remotely hosted embeddings file; pass `use_local=False` so the file is downloaded and stored in the cache first.

```python
embedding = FastTextEmbeddings('/path/to/remote/downloadable/custom_fasttext_embeddings.bin', use_local=False)
```
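
`FastTextEmbeddings` behaves like any other `TokenEmbeddings` class, so it can also be mixed with other embedding types. A minimal sketch, assuming Flair's `StackedEmbeddings` and pre-trained `WordEmbeddings` classes, which are covered elsewhere in the tutorials:

```python
from flair.embeddings import FastTextEmbeddings, StackedEmbeddings, WordEmbeddings

# concatenate FastText vectors with classic GloVe vectors for each token
stacked_embeddings = StackedEmbeddings([
    FastTextEmbeddings('/path/to/local/custom_fasttext_embeddings.bin'),
    WordEmbeddings('glove'),
])

stacked_embeddings.embed(sentence)
```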


## Character Embeddings

Some embeddings - such as character-features - are not pre-trained but rather trained on the downstream task. Normally
