diff --git a/flair/data.py b/flair/data.py index 3d1cc3df41..e3cc6fbf76 100644 --- a/flair/data.py +++ b/flair/data.py @@ -1,3 +1,4 @@ +from abc import abstractmethod from typing import List, Dict, Union import torch @@ -593,27 +594,75 @@ def __len__(self) -> int: return len(self.tokens) -class TaggedCorpus: +class Corpus: + + @property + @abstractmethod + def train(self) -> List[Sentence]: + pass + + @property + @abstractmethod + def dev(self) -> List[Sentence]: + pass + + @property + @abstractmethod + def test(self) -> List[Sentence]: + pass + + @abstractmethod + def downsample(self, percentage: float = 0.1, only_downsample_train=False): + """Downsamples this corpus to a percentage of the sentences.""" + pass + + @abstractmethod + def get_all_sentences(self) -> List[Sentence]: + """Gets all sentences in the corpus (train, dev and test splits together).""" + pass + + @abstractmethod + def make_tag_dictionary(self, tag_type: str) -> Dictionary: + """Produces a dictionary of token tags of tag_type.""" + pass + + @abstractmethod + def make_label_dictionary(self) -> Dictionary: + """ + Creates a dictionary of all labels assigned to the sentences in the corpus. + :return: dictionary of labels + """ + pass + + +class TaggedCorpus(Corpus): def __init__(self, train: List[Sentence], dev: List[Sentence], test: List[Sentence], name: str = 'corpus'): - self.train: List[Sentence] = train - self.dev: List[Sentence] = dev - self.test: List[Sentence] = test + self._train: List[Sentence] = train + self._dev: List[Sentence] = dev + self._test: List[Sentence] = test self.name: str = name + @property + def train(self) -> List[Sentence]: + return self._train + + @property + def dev(self) -> List[Sentence]: + return self._dev + + @property + def test(self) -> List[Sentence]: + return self._test + def downsample(self, percentage: float = 0.1, only_downsample_train=False): - self.train = self._downsample_to_proportion(self.train, percentage) + self._train = self._downsample_to_proportion(self.train, percentage) if not only_downsample_train: - self.dev = self._downsample_to_proportion(self.dev, percentage) - self.test = self._downsample_to_proportion(self.test, percentage) + self._dev = self._downsample_to_proportion(self.dev, percentage) + self._test = self._downsample_to_proportion(self.test, percentage) return self - def clear_embeddings(self): - for sentence in self.get_all_sentences(): - for token in sentence.tokens: - token.clear_embeddings() - def get_all_sentences(self) -> List[Sentence]: all_sentences: List[Sentence] = [] all_sentences.extend(self.train) @@ -798,13 +847,48 @@ def iob_iobes(tags): return new_tags -class MultiCorpus: +class MultiCorpus(Corpus): + def __init__(self, corpora: List[TaggedCorpus]): self.corpora: List[TaggedCorpus] = corpora + @property + def train(self) -> List[Sentence]: + train: List[Sentence] = [] + for corpus in self.corpora: + train.extend(corpus.train) + return train + + @property + def dev(self) -> List[Sentence]: + dev: List[Sentence] = [] + for corpus in self.corpora: + dev.extend(corpus.dev) + return dev + + @property + def test(self) -> List[Sentence]: + test: List[Sentence] = [] + for corpus in self.corpora: + test.extend(corpus.test) + return test + def __str__(self): return '\n'.join([str(corpus) for corpus in self.corpora]) + def get_all_sentences(self) -> List[Sentence]: + sentences = [] + for corpus in self.corpora: + sentences.extend(corpus.get_all_sentences()) + return sentences + + def downsample(self, percentage: float = 0.1, only_downsample_train=False): + + for corpus in self.corpora: + corpus.downsample(percentage, only_downsample_train) + + return self + def make_tag_dictionary(self, tag_type: str) -> Dictionary: # Make the tag dictionary @@ -819,30 +903,13 @@ def make_tag_dictionary(self, tag_type: str) -> Dictionary: tag_dictionary.add_item('') return tag_dictionary - def downsample(self, percentage: float = 0.1, only_downsample_train=False): - - for corpus in self.corpora: - corpus.downsample(percentage, only_downsample_train) - - return self + def make_label_dictionary(self) -> Dictionary: - @property - def train(self) -> List[Sentence]: - train: List[Sentence] = [] + label_dictionary: Dictionary = Dictionary(add_unk=False) for corpus in self.corpora: - train.extend(corpus.train) - return train + labels = set(corpus._get_all_label_names()) - @property - def dev(self) -> List[Sentence]: - dev: List[Sentence] = [] - for corpus in self.corpora: - dev.extend(corpus.dev) - return dev + for label in labels: + label_dictionary.add_item(label) - @property - def test(self) -> List[Sentence]: - test: List[Sentence] = [] - for corpus in self.corpora: - test.extend(corpus.test) - return test + return label_dictionary diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 07d7043fd5..35880a8789 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -9,7 +9,7 @@ import flair import flair.nn -from flair.data import Sentence, Token, TaggedCorpus, Label +from flair.data import Sentence, Token, Label, MultiCorpus, Corpus from flair.models import TextClassifier, SequenceTagger from flair.training_utils import Metric, init_output_file, WeightExtractor, clear_embeddings @@ -18,9 +18,9 @@ class ModelTrainer: - def __init__(self, model: flair.nn.Model, corpus: TaggedCorpus) -> None: + def __init__(self, model: flair.nn.Model, corpus: Corpus) -> None: self.model: flair.nn.Model = model - self.corpus: TaggedCorpus = corpus + self.corpus: Corpus = corpus def train(self, base_path: str, @@ -202,6 +202,14 @@ def train(self, f'{test_metric.f_score(class_name):.4f}') self._log_line() + # if we are training over multiple datasets, do evaluation for each + if type(self.corpus) is MultiCorpus: + for subcorpus in self.corpus.corpora: + self._log_line() + self._log_line() + test_metric, test_loss = self._calculate_evaluation_results_for( + subcorpus.name, subcorpus.test, embeddings_in_memory, mini_batch_size, base_path + '/test.tsv') + return test_metric.micro_avg_f_score() def _calculate_evaluation_results_for(self, dataset_name, dataset, embeddings_in_memory, mini_batch_size, diff --git a/tests/test_data_fetchers.py b/tests/test_data_fetchers.py index bbc1e4f7fe..a9c0eaa8b8 100644 --- a/tests/test_data_fetchers.py +++ b/tests/test_data_fetchers.py @@ -52,4 +52,13 @@ def test_load_no_dev_data(tasks_base_path): assert len(corpus.train) == 5 assert len(corpus.dev) == 1 - assert len(corpus.test) == 1 \ No newline at end of file + assert len(corpus.test) == 1 + + +def test_multi_corpus(tasks_base_path): + # get two corpora as one + corpus = NLPTaskDataFetcher.fetch_corpora([NLPTask.FASHION, NLPTask.GERMEVAL]) + + assert len(corpus.train) == 8 + assert len(corpus.dev) == 2 + assert len(corpus.test) == 2 \ No newline at end of file diff --git a/tests/test_model_integration.py b/tests/test_model_integration.py index 598d7ba52c..6c7e7da2d8 100644 --- a/tests/test_model_integration.py +++ b/tests/test_model_integration.py @@ -307,3 +307,35 @@ def test_train_language_model(results_base_path, resources_path): # clean up results directory shutil.rmtree(results_base_path, ignore_errors=True) + + +@pytest.mark.integration +def test_train_load_use_tagger(results_base_path, tasks_base_path): + + corpus = NLPTaskDataFetcher.fetch_corpora([NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path) + tag_dictionary = corpus.make_tag_dictionary('ner') + + embeddings = WordEmbeddings('glove') + + tagger: SequenceTagger = SequenceTagger(hidden_size=256, + embeddings=embeddings, + tag_dictionary=tag_dictionary, + tag_type='ner', + use_crf=False) + + # initialize trainer + trainer: ModelTrainer = ModelTrainer(tagger, corpus) + + trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=2, test_mode=True) + + loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt') + + sentence = Sentence('I love Berlin') + sentence_empty = Sentence(' ') + + loaded_model.predict(sentence) + loaded_model.predict([sentence, sentence_empty]) + loaded_model.predict([sentence_empty]) + + # clean up results directory + shutil.rmtree(results_base_path)