diff --git a/flair/data.py b/flair/data.py
index 6ca20f4edd..78ec17dfa4 100644
--- a/flair/data.py
+++ b/flair/data.py
@@ -10,7 +10,6 @@
 from segtok.tokenizer import split_contractions
 from segtok.tokenizer import word_tokenizer
 
-
 log = logging.getLogger(__name__)
@@ -590,10 +589,11 @@ def __len__(self) -> int:
 
 
 class TaggedCorpus:
-    def __init__(self, train: List[Sentence], dev: List[Sentence], test: List[Sentence]):
+    def __init__(self, train: List[Sentence], dev: List[Sentence], test: List[Sentence], name: str = 'corpus'):
         self.train: List[Sentence] = train
         self.dev: List[Sentence] = dev
         self.test: List[Sentence] = test
+        self.name: str = name
 
     def downsample(self, percentage: float = 0.1, only_downsample_train=False):
@@ -791,3 +791,53 @@ def iob_iobes(tags):
         else:
             raise Exception('Invalid IOB format!')
     return new_tags
+
+
+class MultiCorpus:
+    def __init__(self, corpora: List[TaggedCorpus]):
+        self.corpora: List[TaggedCorpus] = corpora
+
+    def __str__(self):
+        return '\n'.join([str(corpus) for corpus in self.corpora])
+
+    def make_tag_dictionary(self, tag_type: str) -> Dictionary:
+
+        # Make the tag dictionary
+        tag_dictionary: Dictionary = Dictionary()
+        tag_dictionary.add_item('O')
+        for corpus in self.corpora:
+            for sentence in corpus.get_all_sentences():
+                for token in sentence.tokens:
+                    token: Token = token
+                    tag_dictionary.add_item(token.get_tag(tag_type).value)
+        tag_dictionary.add_item('<START>')
+        tag_dictionary.add_item('<STOP>')
+        return tag_dictionary
+
+    def downsample(self, percentage: float = 0.1, only_downsample_train=False):
+
+        for corpus in self.corpora:
+            corpus.downsample(percentage, only_downsample_train)
+
+        return self
+
+    @property
+    def train(self) -> List[Sentence]:
+        train: List[Sentence] = []
+        for corpus in self.corpora:
+            train.extend(corpus.train)
+        return train
+
+    @property
+    def dev(self) -> List[Sentence]:
+        dev: List[Sentence] = []
+        for corpus in self.corpora:
+            dev.extend(corpus.dev)
+        return dev
+
+    @property
+    def test(self) -> List[Sentence]:
+        test: List[Sentence] = []
+        for corpus in self.corpora:
+            test.extend(corpus.test)
+        return test
diff --git a/flair/data_fetcher.py b/flair/data_fetcher.py
index a2e25674ce..dbb1ecad28 100644
--- a/flair/data_fetcher.py
+++ b/flair/data_fetcher.py
@@ -1,29 +1,35 @@
-from typing import List, Dict
+from typing import List, Dict, Union
 import re
 import os
 import logging
 from enum import Enum
 from pathlib import Path
 
-from flair.data import Sentence, TaggedCorpus, Token
+from flair.data import Sentence, TaggedCorpus, Token, MultiCorpus
 
 log = logging.getLogger(__name__)
 
 
 class NLPTask(Enum):
-    # conll column format
-    CONLL_03 = 'conll_03'
+    # conll 2000 column format
     CONLL_2000 = 'conll_2000'
-    CONLL_03_GERMAN = 'conll_03-ger'
-    ONTONER = 'onto-ner'
-    FASHION = 'fashion'
-    GERMEVAL = 'germeval'
-    SRL = 'srl'
-    WSD = 'wsd'
+
+    # conll 03 NER column format
+    CONLL_03 = 'conll_03'
+    CONLL_03_GERMAN = 'conll_03_german'
+    CONLL_03_DUTCH = 'conll_03_dutch'
+    CONLL_03_SPANISH = 'conll_03_spanish'
 
     # conll-u format
    UD_ENGLISH = 'ud_english'
     UD_GERMAN = 'ud_german'
+
+    # other datasets
+    ONTONER = 'ontoner'
+    FASHION = 'fashion'
+    GERMEVAL = 'germeval'
+    SRL = 'srl'
+    WSD = 'wsd'
     CONLL_12 = 'conll_12'
     PENN = 'penn'
     ONTONOTES = 'ontonotes'
@@ -36,7 +42,11 @@ class NLPTaskDataFetcher:
 
     @staticmethod
-    def fetch_data(task: NLPTask, base_path: Path = None) -> TaggedCorpus:
+    def fetch_corpora(tasks: List[Union[NLPTask, str]], base_path: Path = None) -> MultiCorpus:
+        return MultiCorpus([NLPTaskDataFetcher.fetch_data(task, base_path) for task in tasks])
+
+    @staticmethod
+    def fetch_data(task: Union[NLPTask, str], base_path: Path = None) -> TaggedCorpus:
         """
         Helper function to fetch a TaggedCorpus for a specific NLPTask. For this to work you need to first download
         and put into the appropriate folder structure the corresponding NLP task data. The tutorials on
@@ -50,143 +60,82 @@ def fetch_data(task: NLPTask, base_path: Path = None) -> TaggedCorpus:
         if not base_path:
             base_path = Path('resources') / 'tasks'
 
-        data_folder = str(base_path / str(task.value).lower())
+        # get string value if enum is passed
+        task = task.value if type(task) is NLPTask else task
 
-        log.info("Reading data from {}".format(data_folder))
+        data_folder = str(base_path / task.lower())
 
         # the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk)
-        if task == NLPTask.CONLL_2000:
+        if task == NLPTask.CONLL_2000.value:
             columns = {0: 'text', 1: 'pos', 2: 'np'}
 
             return NLPTaskDataFetcher.fetch_column_corpus(data_folder,
                                                           columns,
-                                                          train_file='train.txt',
-                                                          test_file='test.txt',
                                                           tag_to_biloes='np')
 
         # many NER tasks follow the CoNLL 03 format with four columns: text, pos, np and ner tag
-        if task == NLPTask.CONLL_03 or task == NLPTask.ONTONER or task == NLPTask.FASHION:
+        if task == NLPTask.CONLL_03.value or task == NLPTask.ONTONER.value or task == NLPTask.FASHION.value:
             columns = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}
 
             return NLPTaskDataFetcher.fetch_column_corpus(data_folder,
                                                           columns,
-                                                          train_file='eng.train',
-                                                          test_file='eng.testb',
-                                                          dev_file='eng.testa',
                                                           tag_to_biloes='ner',
                                                           )
 
         # the CoNLL 03 task for German has an additional lemma column
-        if task == NLPTask.CONLL_03_GERMAN:
+        if task == NLPTask.CONLL_03_GERMAN.value:
             columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'}
 
             return NLPTaskDataFetcher.fetch_column_corpus(data_folder,
                                                           columns,
-                                                          train_file='deu.train',
-                                                          test_file='deu.testb',
-                                                          dev_file='deu.testa',
+                                                          tag_to_biloes='ner')
+
+        # the CoNLL 03 task for Dutch has no NP column
+        if task == NLPTask.CONLL_03_DUTCH.value:
+            columns = {0: 'text', 1: 'pos', 2: 'ner'}
+
+            return NLPTaskDataFetcher.fetch_column_corpus(data_folder,
+                                                          columns,
+                                                          tag_to_biloes='ner')
+
+        # the CoNLL 03 task for Spanish only has two columns
+        if task == NLPTask.CONLL_03_SPANISH.value:
+            columns = {0: 'text', 1: 'ner'}
+
+            return NLPTaskDataFetcher.fetch_column_corpus(data_folder,
+                                                          columns,
                                                           tag_to_biloes='ner')
 
         # the GERMEVAL task only has two columns: text and ner
-        if task == NLPTask.GERMEVAL:
+        if task == NLPTask.GERMEVAL.value:
             columns = {1: 'text', 2: 'ner'}
 
             return NLPTaskDataFetcher.fetch_column_corpus(data_folder,
                                                           columns,
-                                                          train_file='NER-de-train.tsv',
-                                                          test_file='NER-de-test.tsv',
-                                                          dev_file='NER-de-dev.tsv',
                                                           tag_to_biloes='ner')
 
         # WSD tasks may be put into this column format
-        if task == NLPTask.WSD:
+        if task == NLPTask.WSD.value:
             columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'sense'}
-
             return NLPTaskDataFetcher.fetch_column_corpus(data_folder,
                                                           columns,
                                                           train_file='semcor.tsv',
                                                           test_file='semeval2015.tsv')
 
         # the UD corpora follow the CoNLL-U format, for which we have a special reader
-        if task == NLPTask.UD_ENGLISH:
-            # get train, test and dev data
-            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'en_ewt-ud-train.conllu'))
-            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'en_ewt-ud-test.conllu'))
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'en_ewt-ud-dev.conllu'))
-
-            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
-
-        if task == NLPTask.UD_GERMAN:
-            # get train, test and dev data
-            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'de_gsd-ud-train.conllu'))
-            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'de_gsd-ud-test.conllu'))
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'de_gsd-ud-dev.conllu'))
-
-            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
-
-        if task == NLPTask.ONTONOTES:
-            # get train, test and dev data
-            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'train.conllu'))
-            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'test.conllu'))
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'dev.conllu'))
-
-            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
-
-        if task == NLPTask.CONLL_12:
-            # get train, test and dev data
-            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'train.propbank.conllu'))
-            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'test.propbank.conllu'))
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'dev.propbank.conllu'))
-            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
-
-        if task == NLPTask.PENN:
-            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'train.conll'))
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'valid.conll'))
-            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
-                os.path.join(data_folder, 'test.conll'))
-
-            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
-
-        # for text classifiers, we use our own special format
-        if task == NLPTask.IMDB:
-            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
-                os.path.join(data_folder, 'train.txt'))
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
-                os.path.join(data_folder, 'dev.txt'))
-            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
-                os.path.join(data_folder, 'test.txt'))
-            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
+        if task.startswith('ud_') or task in [NLPTask.ONTONOTES.value, NLPTask.CONLL_12.value, NLPTask.PENN.value]:
+            return NLPTaskDataFetcher.fetch_ud_corpus(data_folder)
 
         # for text classifiers, we use our own special format
-        if task == NLPTask.AG_NEWS:
-            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
-                os.path.join(data_folder, 'train.txt'))
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
-                os.path.join(data_folder, 'dev.txt'))
-            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
-                os.path.join(data_folder, 'test.txt'))
-            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
+        if task == NLPTask.IMDB.value or task == NLPTask.AG_NEWS.value:
+            return NLPTaskDataFetcher.fetch_classification_corpus(data_folder)
 
     @staticmethod
     def fetch_column_corpus(
             data_folder: Path,
             column_format: Dict[int, str],
-            train_file: str,
-            test_file: str,
+            train_file=None,
+            test_file=None,
             dev_file=None,
             tag_to_biloes=None) -> TaggedCorpus:
         """
@@ -204,6 +153,25 @@ def fetch_column_corpus(
         # TODO: move all paths to use pathlib.Path, for now convert to str for compatibility
         data_folder = str(data_folder)
 
+        # automatically identify train / test / dev files
+        if train_file is None:
+            for file in os.listdir(data_folder):
+                if 'train' in file and '54019' not in file:
+                    train_file = file
+                if 'test' in file:
+                    test_file = file
+                if 'dev' in file:
+                    dev_file = file
+                if 'testa' in file:
+                    dev_file = file
+                if 'testb' in file:
+                    test_file = file
+
+        log.info("Reading data from {}".format(data_folder))
+        log.info("Train: {}".format(train_file))
+        log.info("Dev: {}".format(dev_file))
+        log.info("Test: {}".format(test_file))
+
         # get train and test data
         sentences_train: List[Sentence] = NLPTaskDataFetcher.read_column_data(
             os.path.join(data_folder, train_file), column_format)
@@ -224,8 +192,143 @@ def fetch_column_corpus(
                 sentence: Sentence = sentence
                 sentence.convert_tag_scheme(tag_type=tag_to_biloes, target_scheme='iobes')
 
+        return TaggedCorpus(sentences_train, sentences_dev, sentences_test, name=data_folder)
+
+    @staticmethod
+    def fetch_ud_corpus(
+            data_folder: Path,
+            train_file=None,
+            test_file=None,
+            dev_file=None) -> TaggedCorpus:
+        """
+        Helper function to get a TaggedCorpus from CoNLL-U column-formatted task data such as the UD corpora
+
+        :param data_folder: base folder with the task data
+        :param train_file: the name of the train file
+        :param test_file: the name of the test file
+        :param dev_file: the name of the dev file, if None, it is identified automatically
+        :return: a TaggedCorpus with annotated train, dev and test data
+        """
+
+        # TODO: move all paths to use pathlib.Path, for now convert to str for compatibility
+        data_folder = str(data_folder)
+
+        # automatically identify train / test / dev files
+        if train_file is None:
+            for file in os.listdir(data_folder):
+                if 'train' in file:
+                    train_file = file
+                if 'test' in file:
+                    test_file = file
+                if 'dev' in file:
+                    dev_file = file
+                if 'testa' in file:
+                    dev_file = file
+                if 'testb' in file:
+                    test_file = file
+
+        log.info("Reading data from {}".format(data_folder))
+        log.info("Train: {}".format(train_file))
+        log.info("Dev: {}".format(dev_file))
+        log.info("Test: {}".format(test_file))
+
+        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
+            os.path.join(data_folder, train_file))
+        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
+            os.path.join(data_folder, test_file))
+        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
+            os.path.join(data_folder, dev_file))
+
+        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
+
+    @staticmethod
+    def fetch_classification_corpus(
+            data_folder: Path,
+            train_file=None,
+            test_file=None,
+            dev_file=None) -> TaggedCorpus:
+        """
+        Helper function to get a TaggedCorpus from text classification-formatted task data
+
+        :param data_folder: base folder with the task data
+        :param train_file: the name of the train file
+        :param test_file: the name of the test file
+        :param dev_file: the name of the dev file, if None, it is identified automatically
+        :return: a TaggedCorpus with annotated train, dev and test data
+        """
+
+        # TODO: move all paths to use pathlib.Path, for now convert to str for compatibility
+        data_folder = str(data_folder)
+
+        # automatically identify train / test / dev files
+        if train_file is None:
+            for file in os.listdir(data_folder):
+                if 'train' in file:
+                    train_file = file
+                if 'test' in file:
+                    test_file = file
+                if 'dev' in file:
+                    dev_file = file
+                if 'testa' in file:
+                    dev_file = file
+                if 'testb' in file:
+                    test_file = file
+
+        log.info("Reading data from {}".format(data_folder))
+        log.info("Train: {}".format(train_file))
+        log.info("Dev: {}".format(dev_file))
+        log.info("Test: {}".format(test_file))
+
+        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
+            os.path.join(data_folder, train_file))
+        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
+            os.path.join(data_folder, test_file))
+        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
+            os.path.join(data_folder, dev_file))
 
         return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
 
+    @staticmethod
+    def read_text_classification_file(path_to_file, max_tokens_per_doc=-1):
+        """
+        Reads a data file for text classification. The file should contain one document/text per line.
+        The line should have the following format:
+        __label__<class_name> <text>
+        If you have a multi-class task, you can have as many labels as you want at the beginning of the line, e.g.,
+        __label__<class_name_1> __label__<class_name_2> <text>
+        :param path_to_file: the path to the data file
+        :param max_tokens_per_doc: If set to a positive value, documents are truncated to this number of tokens. If
+        set to -1, all documents are taken as-is.
+        :return: list of sentences
+        """
+        label_prefix = '__label__'
+        sentences = []
+
+        with open(path_to_file) as f:
+            for line in f:
+                words = line.split()
+
+                labels = []
+                l_len = 0
+
+                for i in range(len(words)):
+                    if words[i].startswith(label_prefix):
+                        l_len += len(words[i]) + 1
+                        label = words[i].replace(label_prefix, "")
+                        labels.append(label)
+                    else:
+                        break
+
+                text = line[l_len:].strip()
+
+                if text and labels:
+                    sentence = Sentence(text, labels=labels, use_tokenizer=True)
+                    if 0 < max_tokens_per_doc < len(sentence):
+                        sentence.tokens = sentence.tokens[:max_tokens_per_doc]
+                    sentences.append(sentence)
+
+        return sentences
+
     @staticmethod
     def read_column_data(path_to_column_file: str,
                          column_name_map: Dict[int, str],
@@ -242,7 +345,11 @@ def read_column_data(path_to_column_file: str,
         """
         sentences: List[Sentence] = []
 
-        lines: List[str] = open(path_to_column_file).read().strip().split('\n')
+        try:
+            lines: List[str] = open(path_to_column_file, encoding='utf-8').read().strip().split('\n')
+        except UnicodeDecodeError:
+            log.info('Cannot read {} with encoding "utf-8", falling back to "latin-1".'.format(path_to_column_file))
+            lines: List[str] = open(path_to_column_file, encoding='latin1').read().strip().split('\n')
 
         # most data sets have the token text in the first column, if not, pass 'text' as column
         text_column: int = 0
@@ -359,7 +466,7 @@ def read_text_classification_file(path_to_file, max_tokens_per_doc=-1):
 
             if text and labels:
                 sentence = Sentence(text, labels=labels, use_tokenizer=True)
-                if(len(sentence) > max_tokens_per_doc):
+                if 0 < max_tokens_per_doc < len(sentence):
                     sentence.tokens = sentence.tokens[:max_tokens_per_doc]
                 sentences.append(sentence)
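
A minimal usage sketch of the fetch_corpora / MultiCorpus API added above (my illustration, not part of the patch); it assumes the CoNLL-03 Dutch and Spanish data has already been downloaded into resources/tasks/conll_03_dutch and resources/tasks/conll_03_spanish, the folder names implied by the new enum values and the default base path:

    from flair.data import MultiCorpus
    from flair.data_fetcher import NLPTaskDataFetcher, NLPTask

    # fetch_data() now accepts enum members or plain strings, so both forms work here
    corpus: MultiCorpus = NLPTaskDataFetcher.fetch_corpora(
        [NLPTask.CONLL_03_DUTCH, 'conll_03_spanish'])

    # downsample() and make_tag_dictionary() iterate over every contained TaggedCorpus
    corpus.downsample(0.1)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')

    # the train / dev / test properties concatenate the splits of all corpora
    print(len(corpus.train), len(corpus.dev), len(corpus.test))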
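For reference, read_text_classification_file expects the fastText-style format described in its docstring: one document per line, with one or more __label__ prefixes at the start. A hypothetical two-line input file (label names invented for illustration):

    __label__POSITIVE a thoroughly enjoyable film with a great cast .
    __label__POSITIVE __label__ACTION loud , fast and surprisingly funny .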