From 6c7eb5da3726541e6b70f5d819a159ed3df6f116 Mon Sep 17 00:00:00 2001 From: Josip Krapac Date: Fri, 29 Jan 2021 09:37:40 +0100 Subject: [PATCH 1/4] Added the DocumentTFIDFEmbeddings --- flair/embeddings/document.py | 44 ++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index d5f70a9efa..6db8ed6b1e 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -12,6 +12,8 @@ from flair.embeddings.token import TokenEmbeddings, StackedEmbeddings, FlairEmbeddings from flair.nn import LockedDropout, WordDropout +from sklearn.feature_extraction.text import TfidfVectorizer + log = logging.getLogger("flair") @@ -274,6 +276,48 @@ def extra_repr(self): return f"fine_tune_mode={self.fine_tune_mode}, pooling={self.pooling}" +class DocumentTFIDFEmbeddings(DocumentEmbeddings): + def __init__( + self, + train_dataset, + vectorizer_parms = {} + ): + """The constructor for DocumentTFIDFEmbeddings. + :param train_dataset: the train dataset which will be used to construct vectorizer + :param vectorizer_parms: the dictionary of parameters given to Scikit-learn's TfidfVectorizer constructor + """ + super().__init__() + + self.vectorizer = TfidfVectorizer(**vectorizer_parms) + self.vectorizer.fit([s.to_original_text() for s in train_dataset]) + + self.__embedding_length: int = len(self.vectorizer.vocabulary_) + + self.to(flair.device) + + self.name: str = f"document_tfidf" + + @property + def embedding_length(self) -> int: + return self.__embedding_length + + def embed(self, sentences: Union[List[Sentence], Sentence]): + """Add embeddings to every sentence in the given list of sentences.""" + + # if only one sentence is passed, convert to list of sentence + if isinstance(sentences, Sentence): + sentences = [sentences] + + raw_sentences = [s.to_original_text() for s in sentences] + tfidf_vectors = torch.from_numpy(self.vectorizer.transform(raw_sentences).A) + + for sentence_id, sentence in enumerate(sentences): + sentence.set_embedding(self.name, tfidf_vectors[sentence_id]) + + def _add_embeddings_internal(self, sentences: List[Sentence]): + pass + + class DocumentRNNEmbeddings(DocumentEmbeddings): def __init__( self, From 22451581164fb34f4ee8eac3a209dc12427ef1c2 Mon Sep 17 00:00:00 2001 From: Josip Krapac Date: Mon, 1 Feb 2021 13:16:05 +0100 Subject: [PATCH 2/4] Added DocumentTFIDFEmbeddings to __init__.py of embeddings folder --- flair/embeddings/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flair/embeddings/__init__.py b/flair/embeddings/__init__.py index a37a153e5d..9776320ad5 100644 --- a/flair/embeddings/__init__.py +++ b/flair/embeddings/__init__.py @@ -23,6 +23,7 @@ from .document import DocumentEmbeddings from .document import TransformerDocumentEmbeddings from .document import DocumentPoolEmbeddings +from .document import DocumentTFIDFEmbeddings from .document import DocumentRNNEmbeddings from .document import DocumentLMEmbeddings from .document import SentenceTransformerDocumentEmbeddings @@ -47,4 +48,4 @@ from .legacy import BertEmbeddings from .legacy import DocumentMeanEmbeddings from .legacy import DocumentLSTMEmbeddings -from .legacy import ELMoTransformerEmbeddings \ No newline at end of file +from .legacy import ELMoTransformerEmbeddings From abdaaa30ac2b57e6bdd02f17c6d152a1cad7e448 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Tue, 2 Feb 2021 13:33:31 +0100 Subject: [PATCH 3/4] GH-2085: Cast to float tensor --- flair/embeddings/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index 55d78e7269..d4cb57ee3c 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -368,7 +368,7 @@ def embed(self, sentences: Union[List[Sentence], Sentence]): sentences = [sentences] raw_sentences = [s.to_original_text() for s in sentences] - tfidf_vectors = torch.from_numpy(self.vectorizer.transform(raw_sentences).A) + tfidf_vectors = torch.from_numpy(self.vectorizer.transform(raw_sentences).A).float() for sentence_id, sentence in enumerate(sentences): sentence.set_embedding(self.name, tfidf_vectors[sentence_id]) From dd2c3db3b178674c025e897369702afcd733c9d2 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Tue, 2 Feb 2021 14:00:43 +0100 Subject: [PATCH 4/4] GH-2085: Replace cast with scikit param --- flair/embeddings/document.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index d4cb57ee3c..f743aed7fe 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -339,15 +339,16 @@ class DocumentTFIDFEmbeddings(DocumentEmbeddings): def __init__( self, train_dataset, - vectorizer_parms = {} + **vectorizer_params, ): """The constructor for DocumentTFIDFEmbeddings. :param train_dataset: the train dataset which will be used to construct vectorizer - :param vectorizer_parms: the dictionary of parameters given to Scikit-learn's TfidfVectorizer constructor + :param vectorizer_params: parameters given to Scikit-learn's TfidfVectorizer constructor """ super().__init__() - self.vectorizer = TfidfVectorizer(**vectorizer_parms) + import numpy as np + self.vectorizer = TfidfVectorizer(dtype=np.float32, **vectorizer_params) self.vectorizer.fit([s.to_original_text() for s in train_dataset]) self.__embedding_length: int = len(self.vectorizer.vocabulary_) @@ -368,7 +369,7 @@ def embed(self, sentences: Union[List[Sentence], Sentence]): sentences = [sentences] raw_sentences = [s.to_original_text() for s in sentences] - tfidf_vectors = torch.from_numpy(self.vectorizer.transform(raw_sentences).A).float() + tfidf_vectors = torch.from_numpy(self.vectorizer.transform(raw_sentences).A) for sentence_id, sentence in enumerate(sentences): sentence.set_embedding(self.name, tfidf_vectors[sentence_id])