From 770050e4bd36c5f6624e6fedbcb58f84d22b488c Mon Sep 17 00:00:00 2001
From: Alan Akbik
Date: Sat, 13 Mar 2021 16:24:01 +0100
Subject: [PATCH 1/4] add text pair dataset

---
 flair/datasets/document_classification.py | 86 ++++++++++++++---------
 1 file changed, 52 insertions(+), 34 deletions(-)

diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py
index adeb7f2c21..c5b3cce3cc 100644
--- a/flair/datasets/document_classification.py
+++ b/flair/datasets/document_classification.py
@@ -11,7 +11,7 @@
     Corpus,
     Token,
     FlairDataset,
-    Tokenizer
+    Tokenizer, DataPair
 )
 from flair.tokenization import SegtokTokenizer, SpaceTokenizer
 from flair.datasets.base import find_train_dev_test_files
@@ -454,9 +454,12 @@ def __init__(
 
         # most data sets have the token text in the first column, if not, pass 'text' as column
         self.text_columns: List[int] = []
+        self.pair_columns: List[int] = []
         for column in column_name_map:
             if column_name_map[column] == "text":
                 self.text_columns.append(column)
+            if column_name_map[column] == "pair":
+                self.pair_columns.append(column)
 
         with open(self.path_to_file, encoding=encoding) as csv_file:
 
@@ -488,26 +491,8 @@ def __init__(
 
                 if self.in_memory:
 
-                    text = " ".join(
-                        [row[text_column] for text_column in self.text_columns]
-                    )
-
-                    if self.max_chars_per_doc > 0:
-                        text = text[: self.max_chars_per_doc]
-
-                    sentence = Sentence(text, use_tokenizer=self.tokenizer)
-
-                    for column in self.column_name_map:
-                        column_value = row[column]
-                        if (
-                            self.column_name_map[column].startswith("label")
-                            and column_value
-                        ):
-                            if column_value != self.no_class_label:
-                                sentence.add_label(label_type, column_value)
+                    sentence = self._make_labeled_data_point(row)
 
-                    if 0 < self.max_tokens_per_doc < len(sentence):
-                        sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
                     self.sentences.append(sentence)
 
                 else:
@@ -515,6 +500,52 @@ def __init__(
 
             self.total_sentence_count += 1
 
+    def _make_labeled_data_point(self, row):
+
+        # make sentence from text (and filter for length)
+        text = " ".join(
+            [row[text_column] for text_column in self.text_columns]
+        )
+
+        if self.max_chars_per_doc > 0:
+            text = text[: self.max_chars_per_doc]
+
+        sentence = Sentence(text, use_tokenizer=self.tokenizer)
+
+        if 0 < self.max_tokens_per_doc < len(sentence):
+            sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
+
+        # if a pair column is defined, make a sentence pair object
+        if len(self.pair_columns) > 0:
+
+            text = " ".join(
+                [row[pair_column] for pair_column in self.pair_columns]
+            )
+
+            if self.max_chars_per_doc > 0:
+                text = text[: self.max_chars_per_doc]
+
+            pair = Sentence(text, use_tokenizer=self.tokenizer)
+
+            if 0 < self.max_tokens_per_doc < len(sentence):
+                pair.tokens = pair.tokens[: self.max_tokens_per_doc]
+
+            data_point = DataPair(first=sentence, second=pair)
+
+        else:
+            data_point = sentence
+
+        for column in self.column_name_map:
+            column_value = row[column]
+            if (
+                self.column_name_map[column].startswith("label")
+                and column_value
+            ):
+                if column_value != self.no_class_label:
+                    data_point.add_label(self.label_type, column_value)
+
+        return data_point
+
     def is_in_memory(self) -> bool:
         return self.in_memory
 
@@ -527,20 +558,7 @@ def __getitem__(self, index: int = 0) -> Sentence:
         else:
             row = self.raw_data[index]
 
-            text = " ".join([row[text_column] for text_column in self.text_columns])
-
-            if self.max_chars_per_doc > 0:
-                text = text[: self.max_chars_per_doc]
-
-            sentence = Sentence(text, use_tokenizer=self.tokenizer)
-            for column in self.column_name_map:
-                column_value = row[column]
-                if self.column_name_map[column].startswith("label") and column_value:
-                    if column_value != self.no_class_label:
-                        sentence.add_label(self.label_type, column_value)
-
-            if 0 < self.max_tokens_per_doc < len(sentence):
-                sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
+            sentence = self._make_labeled_data_point(row)
 
             return sentence

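A rough usage sketch of the new "pair" column support (not part of the patch; the data folder, column indices and keyword arguments below are assumptions for illustration):

    from flair.datasets import CSVClassificationCorpus

    # hypothetical TSV with three columns: first sentence, second sentence, label
    column_name_map = {0: "text", 1: "pair", 2: "label"}

    corpus = CSVClassificationCorpus(
        "resources/tasks/pair_classification",  # assumed folder with train/dev/test files
        column_name_map,
        skip_header=True,
        delimiter="\t",
    )

    # when a "pair" column is mapped, each data point is a DataPair of two Sentences
    # rather than a single Sentence
    print(type(corpus.train[0]))
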
From bf79c803fabd54a7e60ea2ea25032d9616fae2aa Mon Sep 17 00:00:00 2001
From: Alan Akbik
Date: Sat, 13 Mar 2021 16:24:40 +0100
Subject: [PATCH 2/4] GH-2148: offset DOCSTARTS as single sentences

---
 flair/datasets/sequence_labeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 2b5ff9ae47..4b83c6d40c 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -3571,7 +3571,7 @@ def __init__(
 
             for row in posts:  # Go through all the post titles
 
-                txtout.writelines("-DOCSTART-\n")  # Start each post with a -DOCSTART- token
+                txtout.writelines("-DOCSTART-\n\n")  # Start each post with a -DOCSTART- token
 
                 # Keep track of how many and which entity mentions does a given post title have
                 link_annots = []  # [start pos, end pos, wiki page title] of an entity mention

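For illustration only (the column layout below is an assumption, not taken from the patch): with the extra newline, each -DOCSTART- marker in the generated file is followed by a blank line, so it is read back as its own single-token sentence instead of being attached to the first token of the post title:

    -DOCSTART-

    Glasgow	B-LINK
    fans	O
    celebrate	O
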
From 4d81634689d9da8d9b4d508f021604f8210130e1 Mon Sep 17 00:00:00 2001
From: Alan Akbik
Date: Sat, 13 Mar 2021 16:25:24 +0100
Subject: [PATCH 3/4] GH-2146: collapse saving parameters into one

---
 flair/trainers/trainer.py | 157 +++++++++++++++++++-------------------
 1 file changed, 77 insertions(+), 80 deletions(-)

diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py
index a21a38a897..021ac53633 100644
--- a/flair/trainers/trainer.py
+++ b/flair/trainers/trainer.py
@@ -39,12 +39,12 @@ class ModelTrainer:
 
     def __init__(
-        self,
-        model: flair.nn.Model,
-        corpus: Corpus,
-        optimizer: torch.optim.Optimizer = SGD,
-        epoch: int = 0,
-        use_tensorboard: bool = False,
+            self,
+            model: flair.nn.Model,
+            corpus: Corpus,
+            optimizer: torch.optim.Optimizer = SGD,
+            epoch: int = 0,
+            use_tensorboard: bool = False,
     ):
         """
         Initialize a model trainer
@@ -61,40 +61,39 @@ def __init__(
         self.use_tensorboard: bool = use_tensorboard
 
     def train(
-        self,
-        base_path: Union[Path, str],
-        learning_rate: float = 0.1,
-        mini_batch_size: int = 32,
-        mini_batch_chunk_size: int = None,
-        max_epochs: int = 100,
-        scheduler = AnnealOnPlateau,
-        cycle_momentum: bool = False,
-        anneal_factor: float = 0.5,
-        patience: int = 3,
-        initial_extra_patience = 0,
-        min_learning_rate: float = 0.0001,
-        train_with_dev: bool = False,
-        train_with_test: bool = False,
-        monitor_train: bool = False,
-        monitor_test: bool = False,
-        embeddings_storage_mode: str = "cpu",
-        checkpoint: bool = False,
-        save_final_model: bool = True,
-        anneal_with_restarts: bool = False,
-        anneal_with_prestarts: bool = False,
-        batch_growth_annealing: bool = False,
-        shuffle: bool = True,
-        param_selection_mode: bool = False,
-        write_weights: bool = False,
-        num_workers: int = 6,
-        sampler=None,
-        use_amp: bool = False,
-        amp_opt_level: str = "O1",
-        eval_on_train_fraction=0.0,
-        eval_on_train_shuffle=False,
-        save_model_at_each_epoch=False,
-        save_model_epoch_step: int = None,
-        **kwargs,
+            self,
+            base_path: Union[Path, str],
+            learning_rate: float = 0.1,
+            mini_batch_size: int = 32,
+            mini_batch_chunk_size: int = None,
+            max_epochs: int = 100,
+            scheduler=AnnealOnPlateau,
+            cycle_momentum: bool = False,
+            anneal_factor: float = 0.5,
+            patience: int = 3,
+            initial_extra_patience=0,
+            min_learning_rate: float = 0.0001,
+            train_with_dev: bool = False,
+            train_with_test: bool = False,
+            monitor_train: bool = False,
+            monitor_test: bool = False,
+            embeddings_storage_mode: str = "cpu",
+            checkpoint: bool = False,
+            save_final_model: bool = True,
+            anneal_with_restarts: bool = False,
+            anneal_with_prestarts: bool = False,
+            batch_growth_annealing: bool = False,
+            shuffle: bool = True,
+            param_selection_mode: bool = False,
+            write_weights: bool = False,
+            num_workers: int = 6,
+            sampler=None,
+            use_amp: bool = False,
+            amp_opt_level: str = "O1",
+            eval_on_train_fraction=0.0,
+            eval_on_train_shuffle=False,
+            save_model_each_k_epochs: int = 0,
+            **kwargs,
     ) -> dict:
         """
         Trains any class that implements the flair.nn.Model interface.
@@ -127,7 +126,8 @@ def train(
             if 'dev' the size is determined from dev set size
         :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training
             and kept fixed during training, otherwise it's sampled at beginning of each epoch
-        :param save_model_at_each_epoch: If True, at each epoch the thus far trained model will be saved
+        :param save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will
+            be saved each 5 epochs. Default is 0 which means no model saving.
         :param save_model_epoch_step: Each save_model_epoch_step'th epoch the thus far trained model will be saved
         :param kwargs: Other arguments for the Optimizer
         :return:
@@ -236,17 +236,18 @@ def train(
 
         # minimize training loss if training with dev data, else maximize dev score
         anneal_mode = "min" if train_with_dev else "max"
-        
+
         if scheduler == OneCycleLR:
             dataset_size = len(self.corpus.train)
             if train_with_dev:
                 dataset_size += len(self.corpus.dev)
             lr_scheduler = OneCycleLR(optimizer,
-                                          max_lr=learning_rate,
-                                          steps_per_epoch=dataset_size//mini_batch_size + 1,
-                                          epochs=max_epochs-self.epoch,  # if we load a checkpoint, we have already trained for self.epoch
-                                          pct_start=0.0,
-                                          cycle_momentum=cycle_momentum)
+                                      max_lr=learning_rate,
+                                      steps_per_epoch=dataset_size // mini_batch_size + 1,
+                                      epochs=max_epochs - self.epoch,
+                                      # if we load a checkpoint, we have already trained for self.epoch
+                                      pct_start=0.0,
+                                      cycle_momentum=cycle_momentum)
         else:
             lr_scheduler = scheduler(
                 optimizer,
                 factor=anneal_factor,
                 patience=patience,
                 initial_extra_patience=initial_extra_patience,
                 mode=anneal_mode,
                 verbose=True,
             )
-        
+
         if (isinstance(lr_scheduler, OneCycleLR) and batch_growth_annealing):
             raise ValueError("Batch growth with OneCycle policy is not implemented.")
@@ -280,10 +281,6 @@ def train(
             sampler.set_dataset(train_data)
             shuffle = False
 
-        if not isinstance(save_model_epoch_step, int) or save_model_epoch_step < 1:
-            log.warning(f'save_model_epoch_step should be positive integer, not {save_model_epoch_step}. It will be set to None')
-            save_model_epoch_step = None
-
         dev_score_history = []
         dev_loss_history = []
         train_loss_history = []
@@ -321,9 +318,9 @@ def train(
 
                 # reload last best model if annealing with restarts is enabled
                 if (
-                    (anneal_with_restarts or anneal_with_prestarts)
-                    and learning_rate != previous_learning_rate
-                    and (base_path / "best-model.pt").exists()
+                        (anneal_with_restarts or anneal_with_prestarts)
+                        and learning_rate != previous_learning_rate
+                        and (base_path / "best-model.pt").exists()
                 ):
                     if anneal_with_restarts:
                         log.info("resetting to best model")
@@ -348,7 +345,7 @@ def train(
                 batch_loader = DataLoader(
                     train_data,
                     batch_size=mini_batch_size,
-                    shuffle=shuffle if self.epoch > 1 else False, # never shuffle the first epoch
+                    shuffle=shuffle if self.epoch > 1 else False,  # never shuffle the first epoch
                     num_workers=num_workers,
                     sampler=sampler,
                 )
@@ -376,7 +373,7 @@ def train(
                     batch_steps = [batch]
                     if len(batch) > micro_batch_size:
                         batch_steps = [
-                            batch[x : x + micro_batch_size]
+                            batch[x: x + micro_batch_size]
                            for x in range(0, len(batch), micro_batch_size)
                         ]
 
@@ -396,7 +393,7 @@ def train(
                     # do the optimizer step
                     torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                     optimizer.step()
-                    
+
                     # do the scheduler step if one-cycle
                     if isinstance(lr_scheduler, OneCycleLR):
                         lr_scheduler.step()
@@ -404,7 +401,7 @@ def train(
                         for group in optimizer.param_groups:
                             learning_rate = group["lr"]
                             if "momentum" in group:
-                                    momentum = group["momentum"]
+                                momentum = group["momentum"]
 
                     seen_batches += 1
                     train_loss += loss.item()
@@ -590,11 +587,11 @@ def train(
 
                 # if we use dev data, remember best model based on dev evaluation score
                 if (
-                    (not train_with_dev or anneal_with_restarts or anneal_with_prestarts)
-                    and not param_selection_mode
-                    and not isinstance(lr_scheduler, OneCycleLR)
-                    and current_score == lr_scheduler.best
-                    and bad_epochs == 0
+                        (not train_with_dev or anneal_with_restarts or anneal_with_prestarts)
+                        and not param_selection_mode
+                        and not isinstance(lr_scheduler, OneCycleLR)
+                        and current_score == lr_scheduler.best
+                        and bad_epochs == 0
                 ):
                     print("saving best model")
                     self.model.save(base_path / "best-model.pt")
@@ -604,8 +601,8 @@ def train(
                     self.model.load_state_dict(last_epoch_model_state_dict)
                     self.model.save(base_path / "pre-best-model.pt")
                     self.model.load_state_dict(current_state_dict)
-                
-                if save_model_at_each_epoch or save_model_epoch_step is not None and not self.epoch % save_model_epoch_step:
+
+                if save_model_each_k_epochs > 0 and not self.epoch % save_model_each_k_epochs:
                     print("saving model of current epoch")
                     model_name = "model_epoch_" + str(self.epoch) + ".pt"
                     self.model.save(base_path / model_name)
@@ -658,7 +655,7 @@ def load_checkpoint(cls, checkpoint: Union[Path, str], corpus: Corpus):
         return model
 
     def final_test(
-        self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8
+            self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8
     ):
         if type(base_path) is str:
             base_path = Path(base_path)
@@ -705,16 +702,16 @@ def final_test(
         return final_score
 
     def find_learning_rate(
-        self,
-        base_path: Union[Path, str],
-        file_name: str = "learning_rate.tsv",
-        start_learning_rate: float = 1e-7,
-        end_learning_rate: float = 10,
-        iterations: int = 100,
-        mini_batch_size: int = 32,
-        stop_early: bool = True,
-        smoothing_factor: float = 0.98,
-        **kwargs,
+            self,
+            base_path: Union[Path, str],
+            file_name: str = "learning_rate.tsv",
+            start_learning_rate: float = 1e-7,
+            end_learning_rate: float = 10,
+            iterations: int = 100,
+            mini_batch_size: int = 32,
+            stop_early: bool = True,
+            smoothing_factor: float = 0.98,
+            **kwargs,
     ) -> Path:
         best_loss = None
         moving_avg_loss = 0
@@ -765,11 +762,11 @@ def find_learning_rate(
             else:
                 if smoothing_factor > 0:
                     moving_avg_loss = (
-                        smoothing_factor * moving_avg_loss
-                        + (1 - smoothing_factor) * loss_item
+                            smoothing_factor * moving_avg_loss
+                            + (1 - smoothing_factor) * loss_item
                     )
                     loss_item = moving_avg_loss / (
-                        1 - smoothing_factor ** (step + 1)
+                            1 - smoothing_factor ** (step + 1)
                     )
                 if loss_item < best_loss:
                     best_loss = loss

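A rough usage sketch of the collapsed checkpointing parameter (the model, corpus and output folder are assumptions for illustration, not taken from the patch):

    from flair.trainers import ModelTrainer

    trainer = ModelTrainer(tagger, corpus)  # any flair.nn.Model and a Corpus

    trainer.train(
        "resources/taggers/example",        # assumed output folder
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=20,
        save_model_each_k_epochs=5,         # replaces save_model_at_each_epoch / save_model_epoch_step
    )

    # writes model_epoch_5.pt, model_epoch_10.pt, ... into the output folder;
    # the default of 0 disables these per-epoch snapshots
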
From ad2e1ed28f266b5c6d712a37920afeda85da0594 Mon Sep 17 00:00:00 2001
From: Alan Akbik
Date: Sat, 13 Mar 2021 16:28:36 +0100
Subject: [PATCH 4/4] GH-2148: use -DOCSTARTS- to separate documents

---
 flair/datasets/sequence_labeling.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 4b83c6d40c..5c274db765 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -3526,7 +3526,6 @@ def __init__(
         self,
         base_path: Union[str, Path] = None,
         in_memory: bool = True,
-        document_as_sequence: bool = False,
         **corpusargs,
     ):
         """
@@ -3643,7 +3642,7 @@ def __init__(
             train_file=corpus_file_name,
             column_delimiter="\t",
             in_memory=in_memory,
-            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
+            document_separator_token="-DOCSTART-",
             **corpusargs,
         )
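With document_separator_token now always set, the corpus marks document boundaries with -DOCSTART- sentences the same way other column corpora in flair do. A rough sketch of the general mechanism (the folder and column format are assumptions for illustration):

    from flair.datasets import ColumnCorpus

    corpus = ColumnCorpus(
        "resources/tasks/my_conll_style_data",  # assumed folder with train/dev/test files
        column_format={0: "text", 1: "ner"},
        document_separator_token="-DOCSTART-",
    )

    # sentences belonging to one document are grouped between -DOCSTART- markers,
    # which document-level models can use as context boundaries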