From 770050e4bd36c5f6624e6fedbcb58f84d22b488c Mon Sep 17 00:00:00 2001
From: Alan Akbik
Date: Sat, 13 Mar 2021 16:24:01 +0100
Subject: [PATCH 1/4] add text pair dataset

---
 flair/datasets/document_classification.py | 86 ++++++++++++++---------
 1 file changed, 52 insertions(+), 34 deletions(-)

diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py
index adeb7f2c21..c5b3cce3cc 100644
--- a/flair/datasets/document_classification.py
+++ b/flair/datasets/document_classification.py
@@ -11,7 +11,7 @@
     Corpus,
     Token,
     FlairDataset,
-    Tokenizer
+    Tokenizer, DataPair
 )
 from flair.tokenization import SegtokTokenizer, SpaceTokenizer
 from flair.datasets.base import find_train_dev_test_files
@@ -454,9 +454,12 @@ def __init__(
 
         # most data sets have the token text in the first column, if not, pass 'text' as column
         self.text_columns: List[int] = []
+        self.pair_columns: List[int] = []
         for column in column_name_map:
             if column_name_map[column] == "text":
                 self.text_columns.append(column)
+            if column_name_map[column] == "pair":
+                self.pair_columns.append(column)
 
         with open(self.path_to_file, encoding=encoding) as csv_file:
 
@@ -488,26 +491,8 @@ def __init__(
 
                 if self.in_memory:
 
-                    text = " ".join(
-                        [row[text_column] for text_column in self.text_columns]
-                    )
-
-                    if self.max_chars_per_doc > 0:
-                        text = text[: self.max_chars_per_doc]
-
-                    sentence = Sentence(text, use_tokenizer=self.tokenizer)
-
-                    for column in self.column_name_map:
-                        column_value = row[column]
-                        if (
-                            self.column_name_map[column].startswith("label")
-                            and column_value
-                        ):
-                            if column_value != self.no_class_label:
-                                sentence.add_label(label_type, column_value)
+                    sentence = self._make_labeled_data_point(row)
 
-                    if 0 < self.max_tokens_per_doc < len(sentence):
-                        sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
                     self.sentences.append(sentence)
 
                 else:
@@ -515,6 +500,52 @@ def __init__(
 
             self.total_sentence_count += 1
 
+    def _make_labeled_data_point(self, row):
+
+        # make sentence from text (and filter for length)
+        text = " ".join(
+            [row[text_column] for text_column in self.text_columns]
+        )
+
+        if self.max_chars_per_doc > 0:
+            text = text[: self.max_chars_per_doc]
+
+        sentence = Sentence(text, use_tokenizer=self.tokenizer)
+
+        if 0 < self.max_tokens_per_doc < len(sentence):
+            sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
+
+        # if a pair column is defined, make a sentence pair object
+        if len(self.pair_columns) > 0:
+
+            text = " ".join(
+                [row[pair_column] for pair_column in self.pair_columns]
+            )
+
+            if self.max_chars_per_doc > 0:
+                text = text[: self.max_chars_per_doc]
+
+            pair = Sentence(text, use_tokenizer=self.tokenizer)
+
+            if 0 < self.max_tokens_per_doc < len(sentence):
+                pair.tokens = pair.tokens[: self.max_tokens_per_doc]
+
+            data_point = DataPair(first=sentence, second=pair)
+
+        else:
+            data_point = sentence
+
+        for column in self.column_name_map:
+            column_value = row[column]
+            if (
+                self.column_name_map[column].startswith("label")
+                and column_value
+            ):
+                if column_value != self.no_class_label:
+                    data_point.add_label(self.label_type, column_value)
+
+        return data_point
+
     def is_in_memory(self) -> bool:
         return self.in_memory
 
@@ -527,20 +558,7 @@ def __getitem__(self, index: int = 0) -> Sentence:
         else:
             row = self.raw_data[index]
 
-            text = " ".join([row[text_column] for text_column in self.text_columns])
-
-            if self.max_chars_per_doc > 0:
-                text = text[: self.max_chars_per_doc]
-
-            sentence = Sentence(text, use_tokenizer=self.tokenizer)
-            for column in self.column_name_map:
-                column_value = row[column]
-                if self.column_name_map[column].startswith("label") and column_value:
-                    if column_value != self.no_class_label:
-                        sentence.add_label(self.label_type, column_value)
-
-            if 0 < self.max_tokens_per_doc < len(sentence):
-                sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
+            sentence = self._make_labeled_data_point(row)
 
             return sentence

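A rough usage sketch of the new "pair" column support (not part of the patch; the data folder, column indices and keyword arguments below are assumptions for illustration):

    from flair.datasets import CSVClassificationCorpus

    # hypothetical TSV with three columns: first sentence, second sentence, label
    column_name_map = {0: "text", 1: "pair", 2: "label"}

    corpus = CSVClassificationCorpus(
        "resources/tasks/pair_classification",  # assumed folder with train/dev/test files
        column_name_map,
        skip_header=True,
        delimiter="\t",
    )

    # when a "pair" column is mapped, each data point is a DataPair of two Sentences
    # rather than a single Sentence
    print(type(corpus.train[0]))
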
From bf79c803fabd54a7e60ea2ea25032d9616fae2aa Mon Sep 17 00:00:00 2001
From: Alan Akbik
Date: Sat, 13 Mar 2021 16:24:40 +0100
Subject: [PATCH 2/4] GH-2148: offset DOCSTARTS as single sentences

---
 flair/datasets/sequence_labeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 2b5ff9ae47..4b83c6d40c 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -3571,7 +3571,7 @@ def __init__(
 
             for row in posts:  # Go through all the post titles
 
-                txtout.writelines("-DOCSTART-\n")  # Start each post with a -DOCSTART- token
+                txtout.writelines("-DOCSTART-\n\n")  # Start each post with a -DOCSTART- token
 
                 # Keep track of how many and which entity mentions does a given post title have
                 link_annots = []  # [start pos, end pos, wiki page title] of an entity mention

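For illustration only (the column layout below is an assumption, not taken from the patch): with the extra newline, each -DOCSTART- marker in the generated file is followed by a blank line, so it is read back as its own single-token sentence instead of being attached to the first token of the post title:

    -DOCSTART-

    Glasgow	B-LINK
    fans	O
    celebrate	O
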
From 4d81634689d9da8d9b4d508f021604f8210130e1 Mon Sep 17 00:00:00 2001
From: Alan Akbik
Date: Sat, 13 Mar 2021 16:25:24 +0100
Subject: [PATCH 3/4] GH-2146: collapse saving parameters into one

---
 flair/trainers/trainer.py | 157 +++++++++++++++++++-------------------
 1 file changed, 77 insertions(+), 80 deletions(-)

diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py
index a21a38a897..021ac53633 100644
--- a/flair/trainers/trainer.py
+++ b/flair/trainers/trainer.py
@@ -39,12 +39,12 @@ class ModelTrainer:
 
     def __init__(
-        self,
-        model: flair.nn.Model,
-        corpus: Corpus,
-        optimizer: torch.optim.Optimizer = SGD,
-        epoch: int = 0,
-        use_tensorboard: bool = False,
+            self,
+            model: flair.nn.Model,
+            corpus: Corpus,
+            optimizer: torch.optim.Optimizer = SGD,
+            epoch: int = 0,
+            use_tensorboard: bool = False,
     ):
         """
         Initialize a model trainer
@@ -61,40 +61,39 @@ def __init__(
         self.use_tensorboard: bool = use_tensorboard
 
     def train(
-        self,
-        base_path: Union[Path, str],
-        learning_rate: float = 0.1,
-        mini_batch_size: int = 32,
-        mini_batch_chunk_size: int = None,
-        max_epochs: int = 100,
-        scheduler = AnnealOnPlateau,
-        cycle_momentum: bool = False,
-        anneal_factor: float = 0.5,
-        patience: int = 3,
-        initial_extra_patience = 0,
-        min_learning_rate: float = 0.0001,
-        train_with_dev: bool = False,
-        train_with_test: bool = False,
-        monitor_train: bool = False,
-        monitor_test: bool = False,
-        embeddings_storage_mode: str = "cpu",
-        checkpoint: bool = False,
-        save_final_model: bool = True,
-        anneal_with_restarts: bool = False,
-        anneal_with_prestarts: bool = False,
-        batch_growth_annealing: bool = False,
-        shuffle: bool = True,
-        param_selection_mode: bool = False,
-        write_weights: bool = False,
-        num_workers: int = 6,
-        sampler=None,
-        use_amp: bool = False,
-        amp_opt_level: str = "O1",
-        eval_on_train_fraction=0.0,
-        eval_on_train_shuffle=False,
-        save_model_at_each_epoch=False,
-        save_model_epoch_step: int = None,
-        **kwargs,
+            self,
+            base_path: Union[Path, str],
+            learning_rate: float = 0.1,
+            mini_batch_size: int = 32,
+            mini_batch_chunk_size: int = None,
+            max_epochs: int = 100,
+            scheduler=AnnealOnPlateau,
+            cycle_momentum: bool = False,
+            anneal_factor: float = 0.5,
+            patience: int = 3,
+            initial_extra_patience=0,
+            min_learning_rate: float = 0.0001,
+            train_with_dev: bool = False,
+            train_with_test: bool = False,
+            monitor_train: bool = False,
+            monitor_test: bool = False,
+            embeddings_storage_mode: str = "cpu",
+            checkpoint: bool = False,
+            save_final_model: bool = True,
+            anneal_with_restarts: bool = False,
+            anneal_with_prestarts: bool = False,
+            batch_growth_annealing: bool = False,
+            shuffle: bool = True,
+            param_selection_mode: bool = False,
+            write_weights: bool = False,
+            num_workers: int = 6,
+            sampler=None,
+            use_amp: bool = False,
+            amp_opt_level: str = "O1",
+            eval_on_train_fraction=0.0,
+            eval_on_train_shuffle=False,
+            save_model_each_k_epochs: int = 0,
+            **kwargs,
     ) -> dict:
         """
         Trains any class that implements the flair.nn.Model interface.
@@ -127,7 +126,8 @@ def train(
             if 'dev' the size is determined from dev set size
         :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training
             and kept fixed during training, otherwise it's sampled at beginning of each epoch
-        :param save_model_at_each_epoch: If True, at each epoch the thus far trained model will be saved
+        :param save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will
+            be saved each 5 epochs. Default is 0 which means no model saving.
         :param save_model_epoch_step: Each save_model_epoch_step'th epoch the thus far trained model will be saved
         :param kwargs: Other arguments for the Optimizer
         :return:
@@ -236,17 +236,18 @@ def train(
 
         # minimize training loss if training with dev data, else maximize dev score
         anneal_mode = "min" if train_with_dev else "max"
-        
+
         if scheduler == OneCycleLR:
             dataset_size = len(self.corpus.train)
             if train_with_dev:
                 dataset_size += len(self.corpus.dev)
             lr_scheduler = OneCycleLR(optimizer,
-                                          max_lr=learning_rate,
-                                          steps_per_epoch=dataset_size//mini_batch_size + 1,
-                                          epochs=max_epochs-self.epoch,  # if we load a checkpoint, we have already trained for self.epoch
-                                          pct_start=0.0,
-                                          cycle_momentum=cycle_momentum)
+                                      max_lr=learning_rate,
+                                      steps_per_epoch=dataset_size // mini_batch_size + 1,
+                                      epochs=max_epochs - self.epoch,
+                                      # if we load a checkpoint, we have already trained for self.epoch
+                                      pct_start=0.0,
+                                      cycle_momentum=cycle_momentum)
         else:
             lr_scheduler = scheduler(
                 optimizer,
                 factor=anneal_factor,
                 patience=patience,
                 initial_extra_patience=initial_extra_patience,
                 mode=anneal_mode,
                 verbose=True,
             )
-        
+
         if (isinstance(lr_scheduler, OneCycleLR) and batch_growth_annealing):
             raise ValueError("Batch growth with OneCycle policy is not implemented.")
@@ -280,10 +281,6 @@ def train(
             sampler.set_dataset(train_data)
             shuffle = False
 
-        if not isinstance(save_model_epoch_step, int) or save_model_epoch_step < 1:
-            log.warning(f'save_model_epoch_step should be positive integer, not {save_model_epoch_step}. It will be set to None')
-            save_model_epoch_step = None
-
         dev_score_history = []
         dev_loss_history = []
         train_loss_history = []
@@ -321,9 +318,9 @@ def train(
 
                 # reload last best model if annealing with restarts is enabled
                 if (
-                    (anneal_with_restarts or anneal_with_prestarts)
-                    and learning_rate != previous_learning_rate
-                    and (base_path / "best-model.pt").exists()
+                        (anneal_with_restarts or anneal_with_prestarts)
+                        and learning_rate != previous_learning_rate
+                        and (base_path / "best-model.pt").exists()
                 ):
                     if anneal_with_restarts:
                         log.info("resetting to best model")
@@ -348,7 +345,7 @@ def train(
                 batch_loader = DataLoader(
                     train_data,
                     batch_size=mini_batch_size,
-                    shuffle=shuffle if self.epoch > 1 else False, # never shuffle the first epoch
+                    shuffle=shuffle if self.epoch > 1 else False,  # never shuffle the first epoch
                     num_workers=num_workers,
                     sampler=sampler,
                 )
@@ -376,7 +373,7 @@ def train(
                     batch_steps = [batch]
                     if len(batch) > micro_batch_size:
                         batch_steps = [
-                            batch[x : x + micro_batch_size]
+                            batch[x: x + micro_batch_size]
                            for x in range(0, len(batch), micro_batch_size)
                         ]
 
@@ -396,7 +393,7 @@ def train(
                     # do the optimizer step
                     torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                     optimizer.step()
-                    
+
                     # do the scheduler step if one-cycle
                     if isinstance(lr_scheduler, OneCycleLR):
                         lr_scheduler.step()
@@ -404,7 +401,7 @@ def train(
                         for group in optimizer.param_groups:
                             learning_rate = group["lr"]
                             if "momentum" in group:
-                                    momentum = group["momentum"]
+                                momentum = group["momentum"]
 
                     seen_batches += 1
                     train_loss += loss.item()
@@ -590,11 +587,11 @@ def train(
 
                 # if we use dev data, remember best model based on dev evaluation score
                 if (
-                    (not train_with_dev or anneal_with_restarts or anneal_with_prestarts)
-                    and not param_selection_mode
-                    and not isinstance(lr_scheduler, OneCycleLR)
-                    and current_score == lr_scheduler.best
-                    and bad_epochs == 0
+                        (not train_with_dev or anneal_with_restarts or anneal_with_prestarts)
+                        and not param_selection_mode
+                        and not isinstance(lr_scheduler, OneCycleLR)
+                        and current_score == lr_scheduler.best
+                        and bad_epochs == 0
                 ):
                     print("saving best model")
                     self.model.save(base_path / "best-model.pt")
@@ -604,8 +601,8 @@ def train(
                     self.model.load_state_dict(last_epoch_model_state_dict)
                     self.model.save(base_path / "pre-best-model.pt")
                     self.model.load_state_dict(current_state_dict)
-                
-                if save_model_at_each_epoch or save_model_epoch_step is not None and not self.epoch % save_model_epoch_step:
+
+                if save_model_each_k_epochs > 0 and not self.epoch % save_model_each_k_epochs:
                     print("saving model of current epoch")
                     model_name = "model_epoch_" + str(self.epoch) + ".pt"
                     self.model.save(base_path / model_name)
@@ -658,7 +655,7 @@ def load_checkpoint(cls, checkpoint: Union[Path, str], corpus: Corpus):
         return model
 
     def final_test(
-        self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8
+            self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8
     ):
         if type(base_path) is str:
             base_path = Path(base_path)
@@ -705,16 +702,16 @@ def final_test(
         return final_score
 
     def find_learning_rate(
-        self,
-        base_path: Union[Path, str],
-        file_name: str = "learning_rate.tsv",
-        start_learning_rate: float = 1e-7,
-        end_learning_rate: float = 10,
-        iterations: int = 100,
-        mini_batch_size: int = 32,
-        stop_early: bool = True,
-        smoothing_factor: float = 0.98,
-        **kwargs,
+            self,
+            base_path: Union[Path, str],
+            file_name: str = "learning_rate.tsv",
+            start_learning_rate: float = 1e-7,
+            end_learning_rate: float = 10,
+            iterations: int = 100,
+            mini_batch_size: int = 32,
+            stop_early: bool = True,
+            smoothing_factor: float = 0.98,
+            **kwargs,
     ) -> Path:
         best_loss = None
         moving_avg_loss = 0
@@ -765,11 +762,11 @@ def find_learning_rate(
             else:
                 if smoothing_factor > 0:
                     moving_avg_loss = (
-                        smoothing_factor * moving_avg_loss
-                        + (1 - smoothing_factor) * loss_item
+                            smoothing_factor * moving_avg_loss
+                            + (1 - smoothing_factor) * loss_item
                     )
                     loss_item = moving_avg_loss / (
-                        1 - smoothing_factor ** (step + 1)
+                            1 - smoothing_factor ** (step + 1)
                     )
                 if loss_item < best_loss:
                     best_loss = loss

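A rough usage sketch of the collapsed checkpointing parameter (the model, corpus and output folder are assumptions for illustration, not taken from the patch):

    from flair.trainers import ModelTrainer

    trainer = ModelTrainer(tagger, corpus)  # any flair.nn.Model and a Corpus

    trainer.train(
        "resources/taggers/example",        # assumed output folder
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=20,
        save_model_each_k_epochs=5,         # replaces save_model_at_each_epoch / save_model_epoch_step
    )

    # writes model_epoch_5.pt, model_epoch_10.pt, ... into the output folder;
    # the default of 0 disables these per-epoch snapshots
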
From ad2e1ed28f266b5c6d712a37920afeda85da0594 Mon Sep 17 00:00:00 2001
From: Alan Akbik
Date: Sat, 13 Mar 2021 16:28:36 +0100
Subject: [PATCH 4/4] GH-2148: use -DOCSTARTS- to separate documents

---
 flair/datasets/sequence_labeling.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 4b83c6d40c..5c274db765 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -3526,7 +3526,6 @@ def __init__(
         self,
         base_path: Union[str, Path] = None,
         in_memory: bool = True,
-        document_as_sequence: bool = False,
         **corpusargs,
     ):
         """
@@ -3643,7 +3642,7 @@ def __init__(
             train_file=corpus_file_name,
             column_delimiter="\t",
             in_memory=in_memory,
-            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
+            document_separator_token="-DOCSTART-",
             **corpusargs,
         )
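With document_separator_token now always set, the corpus marks document boundaries with -DOCSTART- sentences the same way other column corpora in flair do. A rough sketch of the general mechanism (the folder and column format are assumptions for illustration):

    from flair.datasets import ColumnCorpus

    corpus = ColumnCorpus(
        "resources/tasks/my_conll_style_data",  # assumed folder with train/dev/test files
        column_format={0: "text", 1: "ner"},
        document_separator_token="-DOCSTART-",
    )

    # sentences belonging to one document are grouped between -DOCSTART- markers,
    # which document-level models can use as context boundaries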