farm/data_handler/processor.py

import abc
import inspect
import json
import logging
import os
import random
from abc import ABC
from inspect import signature
from pathlib import Path
from random import randint

import numpy as np
from numpy.random import random as random_float
from sklearn.preprocessing import StandardScaler
from tokenizers import Encoding
from tokenizers.pre_tokenizers import WhitespaceSplit
from transformers import AutoConfig

from farm.data_handler.dataset import convert_features_to_dataset
from farm.data_handler.input_features import sample_to_features_text
from farm.data_handler.nq_utils import (
    sample_to_features_qa_Natural_Questions,
    create_samples_qa_Natural_Question,
    convert_qa_input_dict,
)
from farm.data_handler.samples import (
    Sample,
    SampleBasket,
    get_passage_offsets,
    offset_to_token_idx_vecorized
)
from farm.data_handler.utils import (
    expand_labels,
    read_tsv,
    read_tsv_sentence_pair,
    read_docs_from_txt,
    read_ner_file,
    read_squad_file,
    read_jsonl,
    read_dpr_json,
    is_json,
    get_sentence_pair,
    split_with_metadata,
)
from farm.modeling.tokenization import (
    Tokenizer,
    tokenize_with_metadata,
    truncate_sequences,
    tokenize_batch_question_answering,
    _get_start_of_word
)
from farm.utils import MLFlowLogger as MlLogger
from farm.utils import try_get

ID_NAMES = ["example_id", "external_id", "doc_id", "id"]


logger = logging.getLogger(__name__)


class Processor(ABC):
    """
    Is used to generate PyTorch Datasets from input data. An implementation of this abstract class should be created
    for each new data source.
    Implement the abstract methods: file_to_dicts(), _dict_to_samples(), _sample_to_features()
    to be compatible with your data format
    """

    subclasses = {}

    def __init__(
        self,
        tokenizer,
        max_seq_len,
        train_filename,
        dev_filename,
        test_filename,
        dev_split,
        data_dir,
        tasks={},
        proxies=None,
        multithreading_rust=True,
    ):
        """
        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: The name of the file containing test data.
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
        :type dev_split: float
        :param data_dir: The directory in which the train, test and perhaps dev files can be found.
        :type data_dir: str
        :param tasks: Tasks for which the processor shall extract labels from the input data.
                      Usually this includes a single, default task, e.g. text classification.
                      In a multitask setting this includes multiple tasks, e.g. 2x text classification.
                      The task name will be used to connect with the related PredictionHead.
        :type tasks: dict
        :param proxies: proxy configuration to allow downloads of remote datasets.
                    Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
                                    Note: Enabling multithreading in Rust AND multiprocessing in python might cause
                                    deadlocks.
        :type multithreading_rust: bool
        """
        if not multithreading_rust:
            os.environ["RAYON_RS_NUM_CPUS"] = "1"

        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.tasks = tasks
        self.proxies = proxies

        # data sets
        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.dev_split = dev_split
        if data_dir:
            self.data_dir = Path(data_dir)
        else:
            self.data_dir = None
        self.baskets = []

        self._log_params()
        self.problematic_sample_ids = set()

    def __init_subclass__(cls, **kwargs):
        """ This automatically keeps track of all available subclasses.
        Enables generic load() and load_from_dir() for all specific Processor implementation.
        """
        super().__init_subclass__(**kwargs)
        cls.subclasses[cls.__name__] = cls

    @classmethod
    def load(
        cls,
        processor_name,
        data_dir,
        tokenizer,
        max_seq_len,
        train_filename,
        dev_filename,
        test_filename,
        dev_split,
        **kwargs,
    ):
        """
        Loads the class of processor specified by processor name.

        :param processor_name: The class of processor to be loaded.
        :type processor_name: str
        :param data_dir: Directory where data files are located.
        :type data_dir: str
        :param tokenizer: A tokenizer object
        :param max_seq_len: Sequences longer than this will be truncated.
        :type max_seq_len: int
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data.
                             If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: The name of the file containing test data.
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced.
                          Only works if dev_filename is set to None
        :type dev_split: float
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        :return: An instance of the specified processor.
        """

        sig = signature(cls.subclasses[processor_name])
        unused_args = {k: v for k, v in kwargs.items() if k not in sig.parameters}
        logger.debug(
            f"Got more parameters than needed for loading {processor_name}: {unused_args}. "
            f"Those won't be used!"
        )
        processor = cls.subclasses[processor_name](
            data_dir=data_dir,
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            **kwargs,
        )

        return processor

    @classmethod
    def load_from_dir(cls, load_dir):
        """
         Infers the specific type of Processor from a config file (e.g. GNADProcessor) and loads an instance of it.

        :param load_dir: str, directory that contains a 'processor_config.json'
        :return: An instance of a Processor Subclass (e.g. GNADProcessor)
        """
        # read config
        processor_config_file = Path(load_dir) / "processor_config.json"
        config = json.load(open(processor_config_file))
        config["inference"] = True
        # init tokenizer
        if "lower_case" in config.keys():
            logger.warning("Loading tokenizer from deprecated FARM config. "
                           "If you used `custom_vocab` or `never_split_chars`, this won't work anymore.")
            tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"])
        else:
            tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])

        # we have to delete the tokenizer string from config, because we pass it as Object
        del config["tokenizer"]

        processor = cls.load(tokenizer=tokenizer, processor_name=config["processor"], **config)

        for task_name, task in config["tasks"].items():
            processor.add_task(name=task_name,
                               metric=task["metric"],
                               label_list=task["label_list"],
                               label_column_name=task["label_column_name"],
                               text_column_name=task.get("text_column_name", None),
                               task_type=task["task_type"])

        if processor is None:
            raise Exception

        return processor

    @classmethod
    def convert_from_transformers(cls, tokenizer_name_or_path, task_type, max_seq_len, doc_stride,
                                  revision=None, tokenizer_class=None, tokenizer_args=None, use_fast=True, **kwargs):
        config = AutoConfig.from_pretrained(tokenizer_name_or_path, revision=revision, **kwargs)
        tokenizer_args = tokenizer_args or {}
        tokenizer = Tokenizer.load(tokenizer_name_or_path,
                                   tokenizer_class=tokenizer_class,
                                   use_fast=use_fast,
                                   revision=revision,
                                   **tokenizer_args,
                                   **kwargs
                                   )

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"],
                metric="squad",
                data_dir="data",
                doc_stride=doc_stride
            )
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)

        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(tokenizer=tokenizer,
                                                    max_seq_len=max_seq_len,
                                                    data_dir="data",
                                                    label_list=label_list,
                                                    label_column_name="label",
                                                    metric="acc",
                                                    quote_char='"',
                                                    )
        elif task_type == "ner":
            label_list = list(config.id2label.values())
            processor = NERProcessor(
                tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir="data", metric="seq_f1",
                label_list=label_list
            )
        else:
            raise ValueError(f"`task_type` {task_type} is not supported yet. "
                             f"Valid options for arg `task_type`: 'question_answering', "
                             f"'embeddings', 'text_classification', 'ner'")

        return processor

    def save(self, save_dir):
        """
        Saves the vocabulary to file and also creates a json file containing all the
        information needed to load the same processor.

        :param save_dir: Directory where the files are to be saved
        :type save_dir: str
        """
        os.makedirs(save_dir, exist_ok=True)
        config = self.generate_config()
        # save tokenizer incl. attributes
        config["tokenizer"] = self.tokenizer.__class__.__name__

        # Because the fast tokenizers expect a str and not Path
        # always convert Path to str here.
        self.tokenizer.save_pretrained(str(save_dir))

        # save processor
        config["processor"] = self.__class__.__name__
        output_config_file = Path(save_dir) / "processor_config.json"
        with open(output_config_file, "w") as file:
            json.dump(config, file)

    def generate_config(self):
        """
        Generates config file from Class and instance attributes (only for sensible config parameters).
        """
        config = {}
        # self.__dict__ doesn't give parent class attributes
        for key, value in inspect.getmembers(self):
            if is_json(value) and key[0] != "_":
                if issubclass(type(value), Path):
                    value = str(value)
                config[key] = value
        return config

    def add_task(self, name,  metric, label_list, label_column_name=None,
                 label_name=None, task_type=None, text_column_name=None):
        if type(label_list) is not list:
            raise ValueError(f"Argument `label_list` must be of type list. Got: f{type(label_list)}")

        if label_name is None:
            label_name = f"{name}_label"
        label_tensor_name = label_name + "_ids"
        self.tasks[name] = {
            "label_list": label_list,
            "metric": metric,
            "label_tensor_name": label_tensor_name,
            "label_name": label_name,
            "label_column_name": label_column_name,
            "text_column_name": text_column_name,
            "task_type": task_type
        }

    @abc.abstractmethod
    def file_to_dicts(self, file: str) -> [dict]:
        raise NotImplementedError()

    def _dict_to_samples(cls, dictionary: dict, all_dicts=None) -> [Sample]:
        raise NotImplementedError()

    def _sample_to_features(cls, sample: Sample) -> dict:
        raise NotImplementedError()

    def _dict_to_samples_and_features(self, dictionary: dict, all_dicts=None) -> [Sample]:
        raise NotImplementedError()

    def _init_samples_in_baskets(self):
        all_dicts = [b.raw for b in self.baskets]
        for basket in self.baskets:
            try:
                basket.samples = self._dict_to_samples(dictionary=basket.raw, all_dicts=all_dicts)
                for num, sample in enumerate(basket.samples):
                     sample.id = f"{basket.id_internal}-{num}"
            except Exception as e:
                logger.error(f"Could not create sample(s) from this dict: \n {basket.raw}")
                logger.error(f"Error message: {e}")

    def _featurize_samples(self):
        curr_problematic_sample_ids = []
        for basket in self.baskets:
            for sample in basket.samples:
                try:
                    sample.features = self._sample_to_features(sample=sample)
                except Exception as e:
                    curr_problematic_sample_ids.append(sample.id)
        if curr_problematic_sample_ids:
            self.problematic_sample_ids.update(curr_problematic_sample_ids)

    @staticmethod
    def log_problematic(problematic_sample_ids):
        if problematic_sample_ids:
            n_problematic = len(problematic_sample_ids)
            problematic_id_str = ", ".join([str(i) for i in problematic_sample_ids])
            logger.error(
                f"Unable to convert {n_problematic} samples to features. Their ids are : {problematic_id_str}")


    def _init_and_featurize_samples_in_baskets(self):
        for basket in self.baskets:
            all_dicts = [b.raw for b in self.baskets]
            try:
                basket.samples = self._dict_to_samples_and_features(dictionary=basket.raw,
                                                                    all_dicts=all_dicts,
                                                                    basket_id_internal=basket.id_internal)
                for num, sample in enumerate(basket.samples):
                    sample.id = f"{basket.id_internal}-{num}"
            except Exception as e:
                logger.error(f"Could not create sample(s) from this dict: \n {basket.raw}")
                logger.error(f"Error message: {e}")


    @staticmethod
    def _check_sample_features(basket):
        """Check if all samples in the basket has computed its features.

        Args:
            basket: the basket containing the samples

        Returns:
            True if all the samples in the basket has computed its features, False otherwise

        """
        if len(basket.samples) == 0:
            return False
        for sample in basket.samples:
            if sample.features is None:
                return False
        return True

    def _create_dataset(self):
        features_flat = []
        basket_to_remove = []
        for basket in self.baskets:
            if self._check_sample_features(basket):
                for sample in basket.samples:
                    features_flat.extend(sample.features)
            else:
                # remove the entire basket
                basket_to_remove.append(basket)
        if len(basket_to_remove) > 0:
            for basket in basket_to_remove:
                # if basket_to_remove is not empty remove the related baskets
                self.baskets.remove(basket)

        dataset, tensor_names = convert_features_to_dataset(features=features_flat)
        return dataset, tensor_names

    def dataset_from_dicts(self, dicts, indices=None, return_baskets = False):
        """
        Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a
        list of tensor names. This can be used for inference mode.

        :param dicts: List of dictionaries where each contains the data of one input sample.
        :type dicts: list of dicts
        :return: a Pytorch dataset and a list of tensor names.
        """
        # We need to add the index (coming from multiprocessing chunks) to have a unique basket ID

        self.baskets = []
        for id_internal, d in enumerate(dicts):
            id_external = self._id_from_dict(d)
            if indices:
                id_internal = indices[id_internal]
            self.baskets.append(SampleBasket(raw=d, id_external=id_external, id_internal=id_internal))

        self._init_samples_in_baskets()
        self._featurize_samples()
        if indices:
            if 0 in indices:
                self._log_samples(1)
        else:
            self._log_samples(1)

        dataset, tensor_names = self._create_dataset()
        # This mode is for inference where we need to keep baskets
        if return_baskets:
            #TODO simplify
            return dataset, tensor_names, self.problematic_sample_ids, self.baskets
        # This mode is for training where we can free ram by removing baskets
        else:
            return dataset, tensor_names, self.problematic_sample_ids

    def _log_samples(self, n_samples):
        logger.info("*** Show {} random examples ***".format(n_samples))
        for i in range(n_samples):
            random_basket = random.choice(self.baskets)
            random_sample = random.choice(random_basket.samples)
            logger.info(random_sample)

    def _log_params(self):
        params = {
            "processor": self.__class__.__name__,
            "tokenizer": self.tokenizer.__class__.__name__,
        }
        names = ["max_seq_len", "dev_split"]
        for name in names:
            value = getattr(self, name)
            params.update({name: str(value)})
        MlLogger.log_params(params)

    @staticmethod
    def _id_from_dict(d):
        ext_id = try_get(ID_NAMES, d)
        if not ext_id and "qas" in d:
            ext_id = try_get(ID_NAMES, d["qas"][0])
        return ext_id


class TextClassificationProcessor(Processor):
    """
    Used to handle the text classification datasets that come in tabular format (CSV, TSV, etc.)
    """
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        label_list=None,
        metric=None,
        train_filename="train.tsv",
        dev_filename=None,
        test_filename="test.tsv",
        dev_split=0.1,
        dev_stratification=False,
        delimiter="\t",
        quote_char="'",
        skiprows=None,
        label_column_name="label",
        multilabel=False,
        header=0,
        proxies=None,
        max_samples=None,
        text_column_name="text",
        **kwargs
    ):
        """
        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param data_dir: The directory in which the train and dev files can be found.
                         If not available the dataset will be loaded automaticaly
                         if the last directory has the same name as a predefined dataset.
                         These predefined datasets are defined as the keys in the dict at
                         `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
        :type data_dir: str
        :param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
        :type label_list: list
        :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
                 Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
                 For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
        :type metric: str, function, or list
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: None
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
        :type dev_split: float
        :param dev_stratification: if True, create a class-stratified split for the dev set.
        :type dev_stratification: bool
        :param delimiter: Separator used in the input tsv / csv file
        :type delimiter: str
        :param quote_char: Character used for quoting strings in the input tsv/ csv file
        :type quote_char: str
        :param skiprows: number of rows to skip in the tsvs (e.g. for multirow headers)
        :type skiprows: int
        :param label_column_name: name of the column in the input csv/tsv that shall be used as training labels
        :type label_column_name: str
        :param multilabel: set to True for multilabel classification
        :type multilabel: bool
        :param header: which line to use as a header in the input csv/tsv
        :type  header: int
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param text_column_name: name of the column in the input csv/tsv that shall be used as training text
        :type text_column_name: str
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """
        #TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs

        # Custom processor attributes
        self.delimiter = delimiter
        self.quote_char = quote_char
        self.skiprows = skiprows
        self.header = header
        self.max_samples = max_samples
        self.dev_stratification = dev_stratification
        logger.warning(f"Currently no support in Processor for returning problematic ids")

        super(TextClassificationProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,

        )
        if metric and label_list:
            if multilabel:
                task_type = "multilabel_classification"
            else:
                task_type = "classification"
            self.add_task(name="text_classification",
                          metric=metric,
                          label_list=label_list,
                          label_column_name=label_column_name,
                          text_column_name=text_column_name,
                          task_type=task_type)
        else:
            logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
                        "using the default task or add a custom task later via processor.add_task()")

    def file_to_dicts(self, file: str) -> [dict]:
        column_mapping = {}
        for task in self.tasks.values():
            column_mapping[task["label_column_name"]] = task["label_name"]
            column_mapping[task["text_column_name"]] = "text"
        dicts = read_tsv(
            filename=file,
            delimiter=self.delimiter,
            skiprows=self.skiprows,
            quotechar=self.quote_char,
            rename_columns=column_mapping,
            header=self.header,
            proxies=self.proxies,
            max_samples=self.max_samples
            )

        return dicts

    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, debug=False):
        self.baskets = []
        # Tokenize in batches
        texts = [x["text"] for x in dicts]
        tokenized_batch = self.tokenizer.batch_encode_plus(
            texts,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_seq_len,
            padding="max_length"
        )
        input_ids_batch = tokenized_batch["input_ids"]
        segment_ids_batch = tokenized_batch["token_type_ids"]
        padding_masks_batch = tokenized_batch["attention_mask"]
        tokens_batch = [x.tokens for x in tokenized_batch.encodings]

        # From here we operate on a per sample basis
        for dictionary, input_ids, segment_ids, padding_mask, tokens in zip(
                dicts, input_ids_batch, segment_ids_batch, padding_masks_batch, tokens_batch
        ):

            tokenized = {}
            if debug:
                tokenized["tokens"] = tokens

            feat_dict = {"input_ids": input_ids,
                         "padding_mask": padding_mask,
                         "segment_ids": segment_ids}

            # Create labels
            # i.e. not inference
            if not return_baskets:
                label_dict = self.convert_labels(dictionary)
                feat_dict.update(label_dict)

            # Add Basket to self.baskets
            curr_sample = Sample(id=None,
                                 clear_text=dictionary,
                                 tokenized=tokenized,
                                 features=[feat_dict])
            curr_basket = SampleBasket(id_internal=None,
                                       raw=dictionary,
                                       id_external=None,
                                       samples=[curr_sample])
            self.baskets.append(curr_basket)

        if indices and 0 not in indices:
            pass
        else:
            self._log_samples(1)

        # TODO populate problematic ids
        problematic_ids = set()
        dataset, tensornames = self._create_dataset()
        if return_baskets:
            return dataset, tensornames, problematic_ids, self.baskets
        else:
            return dataset, tensornames, problematic_ids


    def convert_labels(self, dictionary):
        ret = {}
        # Add labels for different tasks
        for task_name, task in self.tasks.items():
            label_name = task["label_name"]
            label_raw = dictionary[label_name]
            label_list = task["label_list"]
            if task["task_type"] == "classification":
                # id of label
                label_ids = [label_list.index(label_raw)]
            elif task["task_type"] == "multilabel_classification":
                # multi-hot-format
                label_ids = [0] * len(label_list)
                for l in label_raw.split(","):
                    if l != "":
                        label_ids[label_list.index(l)] = 1
            ret[task["label_tensor_name"]] = label_ids
        return ret


    def _create_dataset(self):
        # TODO this is the proposed new version to replace the mother function
        features_flat = []
        basket_to_remove = []
        for basket in self.baskets:
            if self._check_sample_features(basket):
                for sample in basket.samples:
                    features_flat.extend(sample.features)
            else:
                # remove the entire basket
                basket_to_remove.append(basket)
        dataset, tensor_names = convert_features_to_dataset(features=features_flat)
        return dataset, tensor_names


class TextPairClassificationProcessor(TextClassificationProcessor):
    """
    Used to handle text pair classification datasets (e.g. Answer Selection or Natural Inference) that come in
    tsv format. The columns should be called text, text_b and label.
    """
    def __init__(self, **kwargs):
        super(TextPairClassificationProcessor, self).__init__(**kwargs)

    def file_to_dicts(self, file: str) -> [dict]:
        column_mapping = {}
        for task in self.tasks.values():
            column_mapping[task["label_column_name"]] = task["label_name"]

        dicts = read_tsv_sentence_pair(
            rename_columns=column_mapping,
            filename=file,
            delimiter=self.delimiter,
            skiprows=self.skiprows,
            proxies=self.proxies,
        )


        # during tokenization inside TextClassificationProcessor only the "text" field is tokenized
        # To convert text pairs we combine text and text_b into a tuple, then the tokenizer takes care of
        # adding separator tokens and correct segment IDs
        for d in dicts:
            d["text"] = (d["text"], d["text_b"])
        return dicts


class RegressionProcessor(TextClassificationProcessor):
    """
    Processor to handle a regression dataset in tab separated text + label,
    It uses the text conversion functionality of a TextClassificationProcessor
    but adds special label conversion (scaled float value as label)
    """
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        train_filename="train.tsv",
        dev_filename=None,
        test_filename="test.tsv",
        dev_split=0.1,
        delimiter="\t",
        quote_char="'",
        skiprows=None,
        label_column_name="label",
        label_name="regression_label",
        scaler_mean=None,
        scaler_scale=None,
        proxies=None,
        text_column_name="text",
        **kwargs
    ):
        """
        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param data_dir: The directory in which the train and dev files can be found.
                         If not available the dataset will be loaded automaticaly
                         if the last directory has the same name as a predefined dataset.
                         These predefined datasets are defined as the keys in the dict at
                         `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
        :type data_dir: str
        :param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
        :type label_list: list
        :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
                 Alternatively you can also supply a custom function, that takes preds and labels as args and returns a
                 numerical value. For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
        :type metric: str, function, or list
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: None
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced. No devset is created if this is set to 0.0
            Only used if dev_filename is set to None
        :type dev_split: float
        :param delimiter: Separator used in the input tsv / csv file
        :type delimiter: str
        :param quote_char: Character used for quoting strings in the input tsv/ csv file
        :type quote_char: str
        :param skiprows: number of rows to skip in the tsvs (e.g. for multirow headers)
        :type skiprows: int
        :param label_column_name: name of the column in the input csv/tsv that shall be used as training labels
        :type label_column_name: str
        :param label_name: name for the internal label variable in FARM (only needed to adjust in rare cases)
        :type label_name: str
        :param scaler_mean: Value to substract from the label for normalization
        :type scaler_mean: float
        :param scaler_scale: Value to divide the label by for normalization
        :type scaler_scale: float
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param text_column_name: name of the column in the input csv/tsv that shall be used as training text
        :type text_column_name: str
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """

        super(RegressionProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            proxies=proxies,
            delimiter=delimiter,
            quote_char=quote_char,
            skiprows=skiprows,
        )

        # Note that label_list is being hijacked to store the scaling mean and scale
        self.add_task(name="regression",
                      metric="mse",
                      label_list=[scaler_mean, scaler_scale],
                      label_column_name=label_column_name,
                      task_type="regression",
                      label_name=label_name,
                      text_column_name=text_column_name)

    def file_to_dicts(self, file: str) -> [dict]:
        column_mapping = {}
        for task in self.tasks.values():
            column_mapping[task["label_column_name"]] = task["label_name"]
            column_mapping[task["text_column_name"]] = "text"

        dicts = read_tsv(
            rename_columns=column_mapping,
            filename=file,
            delimiter=self.delimiter,
            skiprows=self.skiprows,
            quotechar=self.quote_char,
            proxies=self.proxies
        )

        self._set_label_scaling(dicts=dicts, file=file)
        return dicts

    def convert_labels(self, dictionary: dict):
        # For regression the label should be scaled
        ret = {}
        for task_name, task in self.tasks.items():
            label_name = task["label_name"]
            label_raw = dictionary[label_name]
            label_list = task["label_list"]
            if task["task_type"] == "regression":
                label = float(label_raw)
                scaled_label = (label - label_list[0]) / label_list[1]
                ret[task["label_tensor_name"]] = [scaled_label]
        return ret

    def _set_label_scaling(self, dicts, file):
        # collect all labels and compute scaling stats
        # for regression with Mean Squared Error loss we need to have labels centered around 0 with unit variance
        train_labels = []
        if self.train_filename in str(file):
            for d in dicts:
                train_labels.append(float(d[self.tasks["regression"]["label_name"]]))
            scaler = StandardScaler()
            scaler.fit(np.reshape(train_labels, (-1, 1)))
            # add to label list in regression task
            self.tasks["regression"]["label_list"] = [scaler.mean_.item(), scaler.scale_.item()]


class TextPairRegressionProcessor(RegressionProcessor):
    """
    Used to handle text pair regression datasets that come in tsv format.
    The columns should be called text, text_b and label.
    """

    def __init__(self, **kwargs):
        super(TextPairRegressionProcessor, self).__init__(**kwargs)

    def file_to_dicts(self, file: str) -> [dict]:
        column_mapping = {}
        for task in self.tasks.values():
            column_mapping[task["label_column_name"]] = task["label_name"]

        dicts = read_tsv_sentence_pair(
            rename_columns=column_mapping,
            filename=file,
            delimiter=self.delimiter,
            skiprows=self.skiprows,
            proxies=self.proxies,
        )

        self._set_label_scaling(dicts=dicts, file=file)

        # during tokenization inside RegressionProcessor only the "text" field is tokenized
        # To convert text pairs we combine text and text_b into a tuple, then the tokenizers takes care of
        # adding separator tokens and correct segment IDs
        for d in dicts:
            d["text"] = (d["text"], d["text_b"])
        return dicts


#########################################
# Processor for Basic Inference ####
#########################################
class InferenceProcessor(TextClassificationProcessor):
    """
    Generic processor used at inference time:
    - fast
    - no labels
    - pure encoding of text into pytorch dataset
    - Doesn't read from file, but only consumes dictionaries (e.g. coming from API requests)
    """

    def __init__(
        self,
        tokenizer,
        max_seq_len,
        **kwargs,
    ):

        super(InferenceProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=None,
            dev_filename=None,
            test_filename=None,
            dev_split=None,
            data_dir=None,
            tasks={},
        )

    @classmethod
    def load_from_dir(cls, load_dir):
        """
         Overwriting method from parent class to **always** load the InferenceProcessor instead of the specific class stored in the config.

        :param load_dir: str, directory that contains a 'processor_config.json'
        :return: An instance of an InferenceProcessor
        """
        # read config
        processor_config_file = Path(load_dir) / "processor_config.json"
        config = json.load(open(processor_config_file))
        # init tokenizer
        tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])
        # we have to delete the tokenizer string from config, because we pass it as Object
        del config["tokenizer"]

        processor = cls.load(tokenizer=tokenizer, processor_name="InferenceProcessor", **config)
        for task_name, task in config["tasks"].items():
            processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])

        if processor is None:
            raise Exception

        return processor

    def file_to_dicts(self, file: str) -> [dict]:
        raise NotImplementedError

    def convert_labels(self, dictionary: dict):
        # For inference we do not need labels
        ret = {}
        return ret

    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, debug=False):
        """
        Function to convert input dictionaries containing text into a torch dataset.
        For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method.
        For slow tokenizers, s3e or wordembedding tokenizers the function works on _dict_to_samples and _sample_to_features
        """
        # TODO remove this sections once tokenizers work the same way for slow/fast and our special tokenizers
        if not self.tokenizer.is_fast:
            self.baskets = []
            for d in dicts:
                sample = self._dict_to_samples(dictionary=d)
                features = self._sample_to_features(sample)
                sample.features = features
                basket = SampleBasket(id_internal=None,
                                      raw=d,
                                      id_external=None,
                                      samples=[sample])
                self.baskets.append(basket)
            if indices and 0 not in indices:
                pass
            else:
                self._log_samples(1)

            problematic_ids = set()
            dataset, tensornames = self._create_dataset()
            ret = [dataset, tensornames, problematic_ids]
            if return_baskets:
                ret.append(self.baskets)
            return ret
        else:
            return super().dataset_from_dicts(dicts=dicts,
                                       indices=indices,
                                       return_baskets=return_baskets,
                                       debug=debug)

    # Private method to keep s3e pooling and embedding extraction working
    def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
        # this tokenization also stores offsets
        tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
        # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
        for seq_name in tokenized.keys():
            tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None,
                                                           tokenizer=self.tokenizer,
                                                           max_seq_len=self.max_seq_len)
        return Sample(id=None, clear_text=dictionary, tokenized=tokenized)

    # Private method to keep s3e pooling and embedding extraction working
    def _sample_to_features(self, sample) -> dict:
        features = sample_to_features_text(
            sample=sample,
            tasks=self.tasks,
            max_seq_len=self.max_seq_len,
            tokenizer=self.tokenizer,
        )
        return features


#########################################
# Processors for NER data ####
#########################################
class NERProcessor(Processor):
    """
    Used to handle most NER datasets, like CoNLL or GermEval 2014
    """

    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        label_list=None,
        metric=None,
        train_filename="train.txt",
        dev_filename="dev.txt",
        test_filename="test.txt",
        dev_split=0.0,
        delimiter="\t",
        proxies=None,
        **kwargs
    ):
        """
        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param data_dir: The directory in which the train and dev files can be found.
                         If not available the dataset will be loaded automaticaly
                         if the last directory has the same name as a predefined dataset.
                         These predefined datasets are defined as the keys in the dict at
                         `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
        :type data_dir: str
        :param label_list: list of labels to predict (strings).
        :type label_list: list
        :param metric: name of metric that shall be used for evaluation, e.g. "seq_f1".
                 Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
                 For using multiple metrics supply them as a list, e.g ["seq_f1", my_custom_metric_fn].
        :type metric: str, function, or list
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: None
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
        :type dev_split: float
        :param delimiter: Separator used in the input tsv / csv file. German version of Conll03 uses a whitespace. GermEval 2014 is tab separated \t
        :type delimiter: str
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """
        # Custom processor attributes
        self.delimiter = delimiter

        self.pre_tokenizer = WhitespaceSplit()

        super(NERProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies
        )
        if metric and label_list:
            self.add_task("ner", metric, label_list)
        else:
            logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
                        "using the default task or add a custom task later via processor.add_task()")

    def file_to_dicts(self, file: str) -> [dict]:
        dicts = read_ner_file(filename=file, sep=self.delimiter,  proxies=self.proxies)
        return dicts

    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
        self.baskets = []

        # Perform batch tokenization
        texts = [x["text"] for x in dicts]
        words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
        words = [[x[0] for x in y] for y in words_and_spans]

        # word_spans_batch is the char span for each whitespace split word
        word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

        tokenized_batch = self.tokenizer.batch_encode_plus(
            words,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_seq_len,
            padding="max_length",
            is_split_into_words=True
        )

        # Create features by iterating over samples
        for i in range(len(dicts)):
            tokenized = tokenized_batch[i]
            d = dicts[i]

            # Either try to extract an ID from the dictionary, or else create an id
            # based on the order of the dictionaries coming in, taking into account
            # the indices generated by chunking and multiprocessing
            id_external = self._id_from_dict(d)
            if indices:
                id_internal = indices[i]
            else:
                id_internal = i

            input_ids = tokenized.ids
            segment_ids = tokenized.type_ids

            # We construct a mask to identify the first token of a word. We will later only use them for predicting entities.
            # Special tokens don't count as initial tokens => we add 0 at the positions of special tokens
            # For BERT we add a 0 in the start and end (for CLS and SEP)
            initial_mask = self._get_start_of_word(tokenized.words)
            assert len(initial_mask) == len(input_ids)

            # This mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            padding_mask = tokenized.attention_mask

            # i.e. if inference, we need to populate the tokenized_dict so that formatted preds can align
            # the prediction to the original text
            if return_baskets:
                token_to_word_map = tokenized.words
                word_spans = word_spans_batch[i]
                tokenized_dict = {
                    "tokens": tokenized.tokens,
                    "word_spans": word_spans,
                    "token_to_word_map": token_to_word_map,
                    "start_of_word": initial_mask
                }
            else:
                tokenized_dict = {}

            feature_dict = {
                "input_ids": input_ids,
                "padding_mask": padding_mask,
                "segment_ids": segment_ids,
                "initial_mask": initial_mask,
            }

            for task_name, task in self.tasks.items():
                try:
                    label_list = task["label_list"]
                    label_name = task["label_name"]
                    label_tensor_name = task["label_tensor_name"]
                    labels_word = d[label_name]
                    labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                    label_ids = [label_list.index(lt) for lt in labels_token]
                except ValueError:
                    # Usually triggered if label is not in label list
                    label_ids = None
                    problematic_labels = set(labels_token).difference(set(label_list))
                    logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                                   f"\nWe found a problem with labels {str(problematic_labels)}")
                # TODO change this when inference flag is implemented
                except KeyError:
                    # Usually triggered if there is no label in the sample
                    # This is expected during inference since there are no labels
                    # During training, this is a problem
                    label_ids = None
                    logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                                   "\nIf your are running in *inference* mode: Don't worry!"
                                   "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")

                if label_ids:
                    feature_dict[label_tensor_name] = label_ids

            curr_sample = Sample(id=None,
                                 clear_text=d,
                                 tokenized=tokenized_dict,
                                 features=[feature_dict])
            curr_basket = SampleBasket(id_internal=id_internal,
                                       raw=d,
                                       id_external=id_external,
                                       samples=[curr_sample])
            self.baskets.append(curr_basket)

        # Don't log if we are processing a dataset chunk other than the first chunk
        if indices and 0 not in indices:
            pass
        else:
            self._log_samples(1)

        dataset, tensor_names = self._create_dataset()
        ret = [dataset, tensor_names, self.problematic_sample_ids]
        # This is for inference where we need to keep baskets
        # By contrast, in training, we can remove baskets to free up RAM
        if return_baskets:
            ret.append(self.baskets)
        return tuple(ret)

    @staticmethod
    def _get_start_of_word(word_ids):
        words = np.array(word_ids)
        words[words == None] = -1
        start_of_word_single = [0] + list(np.ediff1d(words) > 0)
        start_of_word_single = [int(x) for x in start_of_word_single]
        return start_of_word_single

#####################
# LM Processors ####
#####################
class BertStyleLMProcessor(Processor):
    """
    Prepares data for masked language model training and next sentence prediction in the style of BERT
    """

    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        train_filename="train.txt",
        dev_filename="dev.txt",
        test_filename="test.txt",
        dev_split=0.0,
        next_sent_pred=True,
        next_sent_pred_style="bert-style",
        max_docs=None,
        proxies=None,
        masked_lm_prob=0.15,
        **kwargs
    ):
        """
        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param data_dir: The directory in which the train and dev files can be found.
                         If not available the dataset will be loaded automaticaly
                         if the last directory has the same name as a predefined dataset.
                         These predefined datasets are defined as the keys in the dict at
                         `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
        :type data_dir: str
        :param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
        :type label_list: list
        :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
                 Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
                 For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
        :type metric: str, function, or list
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: None
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
        :type dev_split: float
        :param next_sent_pred: Whether to use next_sentence_prediction objective or not
        :type next_sent_pred: bool
        :param next_sent_pred_style:
            Two different styles for next sentence prediction available:
                - "sentence":   Use of a single sentence for Sequence A and a single sentence for Sequence B
                - "bert-style": Fill up all of max_seq_len tokens and split into Sequence A and B at sentence border.
                                If there are too many tokens, Sequence B will be truncated.
        :type next_sent_pred_style: str
        :param max_docs: maximum number of documents to include from input dataset
        :type max_docs: int
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param masked_lm_prob: probability of masking a token
        :type masked_lm_prob: float
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """

        self.delimiter = ""
        self.max_docs = max_docs

        if not tokenizer.is_fast:
            raise ValueError("This processor only supports FastTokenizers. "
                             "Load one by calling Tokenizer.load(..., use_fast=True)")

        super(BertStyleLMProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies
        )

        self.next_sent_pred = next_sent_pred
        self.next_sent_pred_style = next_sent_pred_style
        added_tokens = self.get_added_tokens()
        self.add_task("lm", "acc", list(self.tokenizer.vocab) + added_tokens)
        if self.next_sent_pred:
            self.add_task("nextsentence", "acc", ["False", "True"])
        self.masked_lm_prob = masked_lm_prob

    def get_added_tokens(self):
        dictionary = self.tokenizer.get_added_vocab()
        sorted_tuples = sorted(dictionary.items(), key=lambda x: x[0])
        return [x[1] for x in sorted_tuples]

    def file_to_dicts(self, file: str) -> list:
        dicts = read_docs_from_txt(filename=file, delimiter=self.delimiter, max_docs=self.max_docs, proxies=self.proxies)
        return dicts

    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False):
        dicts = [d["doc"] for d in dicts]
        # 1) Create samples & truncate (sentence pairs)
        # next sentence prediction ...
        if self.next_sent_pred:
            assert len(dicts) > 1, "Need at least 2 documents to sample random sentences from"
            # ...with single sentences
            if self.next_sent_pred_style == "sentence":
                samples = self._create_sequence_pairs_by_line(dicts)
            # ...bert style
            elif self.next_sent_pred_style == "bert-style":
                 samples = self._create_sequence_pairs_bert_style(dicts)
            else:
                raise NotImplementedError("next_sent_pred_style has to be 'sentence' or 'bert-style'")

        # no next sentence prediction
        else:
           samples = self._create_sequence_pairs_no_next_sent(dicts)

        # 2) Create labels (masking words + NSP)
        features = []
        vocab_length = len(self.tokenizer.vocab)-1
        for sample in samples:
            features.append(self._create_labels(sample=sample, vocab_length=vocab_length))

        # 3) Create dataset
        dataset, tensor_names = convert_features_to_dataset(features=features)
        return dataset, tensor_names, set()

    def _create_sequence_pairs_by_line(self, docs):
        samples = []
        raw_pairs = []
        labels = []
        for doc in docs:
            # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
            for idx in range(len(doc) - 1):
                text_a, text_b, is_next_label = get_sentence_pair(doc, docs, idx)
                raw_pairs.append((text_a, text_b))
                labels.append(is_next_label)

        # Tokenize + Encode masks
        encoded_pairs = self.tokenizer.batch_encode_plus(raw_pairs,
                                                         max_length=self.max_seq_len,
                                                         truncation=True,
                                                         truncation_strategy="longest_first",
                                                         add_special_tokens=True,
                                                         padding='max_length'
                                                         )

        assert len(encoded_pairs.input_ids) == len(raw_pairs)

        # Create "Start of word mask"
        start_of_word = []
        for e in encoded_pairs.encodings:
            start_of_word.append(_get_start_of_word(e.words, e.special_tokens_mask))

        # Create Sample objects
        for idx in range(len(raw_pairs)):
            if len(encoded_pairs.input_ids[idx]) == 0:
                logger.warning(
                    f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {raw_pairs[idx]}")
                continue

            # We don't populate 'tokenized' here as we skiped the intermediate string token stage abeoce to improve the speed ...
            samples.append(Sample(id=None,
                                  clear_text={"text_a": raw_pairs[idx][0],
                                              "text_b": raw_pairs[idx][1],
                                              "nextsentence_label": labels[idx]},
                                  tokenized={"tokens": encoded_pairs.encodings[idx].tokens,
                                             "start_of_word": start_of_word[idx],
                                             "special_tokens_mask": encoded_pairs.encodings[idx].special_tokens_mask,
                                             "offsets": encoded_pairs.encodings[idx].offsets},
                                  features={"input_ids": encoded_pairs.input_ids[idx],
                                            "segment_ids": encoded_pairs.token_type_ids[idx],
                                            "padding_mask": encoded_pairs.attention_mask[idx],
                                            }
                                  ))
        return samples

    def _create_sequence_pairs_bert_style(self, docs):
        samples = []

        # 1) Tokenize + Encode all docs
        # TODO optimize for single batch call
        encoded_docs = []
        for doc in docs:
            encoded_sentences = self.tokenizer.batch_encode_plus(doc, add_special_tokens=False)
            # Create "Start of word mask"
            for e in encoded_sentences.encodings:
                e.start_of_word = _get_start_of_word(e.words, e.special_tokens_mask)
            encoded_docs.append(encoded_sentences)

        # 2) Create sequence pairs that utilize full possible length up to max_seq_len
        # TODO make num special tokens more general
        # account for [CLS], [SEP], [SEP]
        max_num_tokens = self.max_seq_len - 3
        for enc_doc in encoded_docs:
            current_chunk = []
            current_length = 0
            i = 0
            while i < len(enc_doc.encodings):
                current_length += len(enc_doc[i].tokens)
                current_chunk.append(enc_doc[i])

                if current_length >= max_num_tokens:
                    # split our list of sequences (=chunk) into two sequences and create a sample out of it
                    # (incl. special tokens and all other masks)
                    sample, num_unused_segments = self._create_sample_bert_style(
                        chunk=current_chunk,
                        random_doc=encoded_docs[random.randint(0, len(encoded_docs)-1)],
                        max_num_tokens=max_num_tokens,
                    )
                    samples.append(sample)
                    i -= num_unused_segments

                    current_chunk = []
                    current_length = 0
                i += 1
        return samples

    def _create_sequence_pairs_no_next_sent(self, docs):
        samples = []
        # flatten into list of sentences
        docs = [sent for doc in docs for sent in doc]
        # Tokenize + Encode masks
        #TODO fill up sequences rather than creating one-sentence-samples to make this more efficient
        encoded_pairs = self.tokenizer.batch_encode_plus(docs,
                                                         max_length=self.max_seq_len,
                                                         truncation=True,
                                                         truncation_strategy="longest_first",
                                                         add_special_tokens=True,
                                                         padding='max_length'
                                                         )

        assert len(encoded_pairs.input_ids) == len(docs)

        # Create "Start of word mask"
        start_of_word = []
        for e in encoded_pairs.encodings:
            start_of_word.append(_get_start_of_word(e.words, e.special_tokens_mask))

        # Create Sample objects
        for idx in range(len(docs)):
            if len(encoded_pairs.input_ids[idx]) == 0:
                logger.warning(
                    f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {docs[idx]}")
                continue

            # We don't populate 'tokenized' here as we skiped the intermediate string token stage abeoce to improve the speed ...
            samples.append(Sample(id=None,
                                  clear_text={"text_a": docs[idx]},
                                  tokenized={"tokens": encoded_pairs.encodings[idx].tokens,
                                             "start_of_word": start_of_word[idx],
                                             "special_tokens_mask": encoded_pairs.encodings[idx].special_tokens_mask,
                                             "offsets": encoded_pairs.encodings[idx].offsets},
                                  features={"input_ids": encoded_pairs.input_ids[idx],
                                            "segment_ids": encoded_pairs.token_type_ids[idx],
                                            "padding_mask": encoded_pairs.attention_mask[idx],
                                            }
                                  ))
        return samples

    def _create_sample_bert_style(self, chunk, random_doc, max_num_tokens, prob_next_sentence=0.5):
        """
        Get one sample from corpus consisting of two sequences. A sequence can consist of more than one sentence.
        With prob. 50% these are two subsequent sequences from one doc. With 50% the second sequence will be a
        random one from another document.

        :param chunk: List of subsequent, tokenized and encoded sentences.
        :type chunk: [Encoding]
        :param random_doc: A random doc where we can sample a random next "sentence" from.
        :type random_doc: [str]
        :param max_num_tokens: Samples are truncated after this many tokens.
        :type max_num_tokens: int
        :return: (Sample, int)
            sample,
            number of unused sentences in chunk
        """
        # edge case: if we have only a single sequence, we split that one in half
        if len(chunk) == 1:
            # Define splitting point
            if int(len(chunk[0].tokens) / 2) >= max_num_tokens:
                boundary = int(max_num_tokens / 2)
            else:
                boundary = int(len(chunk[0].tokens) / 2)

            # Insert special tokens
            input_ids = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=chunk[0].ids[:boundary],
                                                                        token_ids_1=chunk[0].ids[
                                                                                    boundary:max_num_tokens])

            segment_ids = self.tokenizer.create_token_type_ids_from_sequences(token_ids_0=chunk[0].ids[:boundary],
                                                                              token_ids_1=chunk[0].ids[
                                                                                          boundary:max_num_tokens])

            # TODO make this general for other model types
            start_of_word = [0] + chunk[0].start_of_word[:boundary] + [0] + chunk[0].start_of_word[boundary:max_num_tokens] + [0]
            padding_mask = [1] * len(input_ids)

            assert len(start_of_word) == len(input_ids)
            assert len(padding_mask) == len(input_ids)
            assert len(segment_ids) == len(input_ids)

            sample = Sample(id=None,
                            clear_text= {"text_a": None,
                                        "text_b": None,
                                        "nextsentence_label": True},
                            tokenized= {"start_of_word": start_of_word},
                            features= {"input_ids": input_ids,
                                        "segment_ids": segment_ids,
                                        "padding_mask": padding_mask,
                                        }
            )
            num_unused_segments = 0
            return sample, num_unused_segments
        else:
            # determine how many segments from chunk go into sequence A
            a_end = random.randrange(1, len(chunk))
            sequence_a = chunk[:a_end]
            length_a = sum([len(seq) for seq in sequence_a])

            # Build sequence B
            target_b_length = max_num_tokens - length_a
            # a) .. using actual next sequence
            if (random.random() > prob_next_sentence) and (len(chunk) > 1):
                sequence_b = chunk[a_end:]
                label = True
                num_unused_segments = 0

            # b) ... using random next sequence
            else:
                sequence_b = []
                length_b = 0
                if len(random_doc.encodings) == 1:
                    sequence_b.append(random_doc[0])
                else:
                    # pick random start sentence and then fill up to target length
                    random_start = random.randrange(len(random_doc.encodings)-1)
                    for i in range(random_start, len(random_doc.encodings)):
                        sequence_b.append(random_doc[i])
                        length_b += len(random_doc[i].ids)
                        if length_b >= target_b_length:
                            break

                label = False

                # We didn't use all of the segments in this chunk as we sampled a random sequence => put them back
                num_unused_segments = len(chunk) - a_end

            # Join everything to single sample
            def merge_start_of_word(sequences):
                start_of_word = []
                for s in sequences:
                    start_of_word.extend(s.start_of_word)
                return start_of_word

            start_of_word_a = merge_start_of_word(sequence_a)
            start_of_word_b = merge_start_of_word(sequence_b)

            sequence_a = Encoding.merge(sequence_a)
            sequence_b = Encoding.merge(sequence_b)

            assert len(sequence_a.ids) > 0
            assert len(sequence_b.ids) > 0

            # Insert special tokens
            input_ids = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=sequence_a.ids,
                                                                        token_ids_1=sequence_b.ids[:target_b_length])

            segment_ids = self.tokenizer.create_token_type_ids_from_sequences(token_ids_0=sequence_a.ids,
                                                                              token_ids_1=sequence_b.ids[:target_b_length])

            # TODO make this general for other model types
            start_of_word = [0] + start_of_word_a + [0] + start_of_word_b[:target_b_length] + [0]
            padding_mask = [1] * len(input_ids)

            if len(input_ids) < self.max_seq_len:
                # Pad up to the sequence length. For certain models, the pad token id is not 0 (e.g. Roberta where it is 1)
                pad_idx = self.tokenizer.pad_token_id
                padding = [pad_idx] * (self.max_seq_len - len(input_ids))
                zero_padding = [0] * (self.max_seq_len - len(input_ids))

                input_ids += padding
                padding_mask += zero_padding
                segment_ids += zero_padding
                start_of_word += zero_padding

            assert len(start_of_word) == len(input_ids)
            assert len(padding_mask) == len(input_ids)
            assert len(segment_ids) == len(input_ids)

            sample = Sample(id=None,
                            clear_text={"text_a": None,
                                        "text_b": None,
                                        "nextsentence_label": label},
                            tokenized={"start_of_word": start_of_word},
                            features={"input_ids": input_ids,
                                      "segment_ids": segment_ids,
                                      "padding_mask": padding_mask,
                                      }
                                )

        return sample, num_unused_segments

    def _create_labels(self, sample, vocab_length) -> dict:
        # Mask random words
        input_ids, lm_label_ids = self._mask_random_words(sample.features["input_ids"], vocab_length, token_groups=sample.tokenized["start_of_word"])
        sample.features["lm_label_ids"] = lm_label_ids
        sample.features["input_ids"] = input_ids

        # NSP label
        if self.next_sent_pred:
            # Convert is_next_label: Note that in Bert, is_next_labelid = 0 is used for next_sentence=true!
            if sample.clear_text["nextsentence_label"]:
                sample.features["nextsentence_label_ids"] = [0]
            else:
                sample.features["nextsentence_label_ids"] = [1]

        assert len(sample.features["input_ids"]) == self.max_seq_len
        assert len(sample.features["padding_mask"]) == self.max_seq_len
        assert len(sample.features["segment_ids"]) == self.max_seq_len
        assert len(sample.features["lm_label_ids"]) == self.max_seq_len

        return sample.features

    def _mask_random_words(self, tokens, vocab_length, token_groups=None, max_predictions_per_seq=20):
        """
        Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
        num_masked.
        If token_groups is supplied, whole word masking is applied, so *all* tokens of a word are either masked or not.
        This option was added by the BERT authors later and showed solid improvements compared to the original objective.
        Whole Word Masking means that if we mask all of the wordpieces corresponding to an original word.
        When a word has been split intoWordPieces, the first token does not have any marker and any subsequence
        tokens are prefixed with ##. So whenever we see the ## token, we
        append it to the previous set of word indexes. Note that Whole Word Masking does *not* change the training code
        at all -- we still predict each WordPiece independently, softmaxed over the entire vocabulary.
        This implementation is mainly a copy from the original code by Google, but includes some simplifications.

        :param tokens: tokenized sentence.
        :type tokens: [str]
        :param vocab_length: number of tokens in the vocabulary
        :type vocab_length: int
        :param token_groups: If supplied, only whole groups of tokens get masked. This can be whole words but
        also other types (e.g. spans). Booleans indicate the start of a group.
        :type token_groups: [bool]
        :param max_predictions_per_seq: maximum number of masked tokens
        :type max_predictions_per_seq: int
        :return: (list of int, list of int), masked tokens and related labels for LM prediction
        """
        
        # 1. Combine tokens to one group (e.g. all subtokens of a word)

        # tokens can have different ids depending on the model
        pad_token_id = self.tokenizer.vocab["[PAD]"]
        cls_token_id = self.tokenizer.vocab["[SEP]"]
        sep_token_id = self.tokenizer.vocab["[CLS]"]
        mask_token_id = self.tokenizer.vocab["[MASK]"]

        cand_indices = []
        for (i, token) in enumerate(tokens):
            if token == cls_token_id or token == sep_token_id or token == pad_token_id: # CLS, SEP and PAD tokens
                continue
            if (token_groups and len(cand_indices) >= 1 and not token_groups[i]):
                cand_indices[-1].append(i)
            else:
                cand_indices.append([i])

        num_to_mask = min(max_predictions_per_seq,
                          max(1, int(round(len(tokens) * self.masked_lm_prob ))))

        random.shuffle(cand_indices)

        output_label = [-1] * len(tokens)
        num_masked = 0
        assert mask_token_id not in tokens #mask token

        # 2. Mask the first groups until we reach the number of tokens we wanted to mask (num_to_mask)
        for index_set in cand_indices:
            if num_masked >= num_to_mask:
                break
            # If adding a whole-word mask would exceed the maximum number of
            # predictions, then just skip this candidate.
            if num_masked + len(index_set) > num_to_mask:
                continue

            for index in index_set:
                prob = random.random()
                num_masked += 1
                original_token = tokens[index]
                # 80% randomly change token to mask token
                if prob < 0.8:
                    tokens[index] = mask_token_id

                # 10% randomly change token to random token
                # TODO currently custom vocab is not included here
                elif prob < 0.9:
                    tokens[index] = random.randint(0, vocab_length)

                # -> rest 10% randomly keep current token

                # append current token to output (we will predict these later)
                try:
                    output_label[index] = original_token
                except KeyError:
                    # For unknown words (should not occur with BPE vocab)
                    output_label[index] = 100 # UNK token
                    logger.warning(
                        "Cannot find token '{}' in vocab. Using [UNK] instead".format(original_token)
                    )

        return tokens, output_label

    def estimate_n_samples(self, filepath, max_docs=500):
        """
        Estimates the number of samples from a given file BEFORE preprocessing.
        Used in StreamingDataSilo to estimate the number of steps before actually processing the data.
        The estimated number of steps will impact some types of Learning Rate Schedules.
        :param filepath: str or Path, file with data used to create samples (e.g. train.txt)
        :param max_docs: int, maximum number of docs to read in & use for our estimate of n_samples
        :return: int, number of samples in the given dataset
        """

        total_lines = sum(1 for line in open(filepath, encoding="utf-8"))
        empty_lines = sum(1 if line == "\n" else 0 for line in open(filepath, encoding="utf-8"))

        if self.next_sent_pred_style == "sentence":
            # one sample = two lines (except last line in doc)
            n_samples = total_lines - (2 * empty_lines)
        elif self.next_sent_pred_style == "bert-style":
            # Original BERT LM training (filling up sequence pairs with sentences until max_seq_len)
            # (This is a very rough heuristic, as we can only estimate the real number of samples AFTER tokenization)
            logging.info(f"Estimating total number of samples ...")
            # read in subset of docs
            if self.max_docs:
                temp = self.max_docs
                self.max_docs = min(max_docs, temp)
                dicts = list(self.file_to_dicts(filepath))
                self.max_docs = temp
            else:
                self.max_docs = max_docs
                dicts = list(self.file_to_dicts(filepath))
                self.max_docs = None
            # count samples
            dicts = [d["doc"] for d in dicts]
            n_samples = len(self._create_sequence_pairs_bert_style(docs=dicts))
            # extrapolate to the whole file
            n_samples = int(n_samples / len(dicts)) * (empty_lines+1)
            logging.info(f"Heuristic estimate of number of samples in {filepath} based on {len(dicts)} docs: {n_samples}")
        else:
            raise NotImplementedError(f"No estimate logic for next_sent_pred_style={self.next_sent_pred_style} implemented")
        return n_samples


#########################################
# QA Processors ####
#########################################
class SquadProcessor(Processor):
    """ Used to handle the SQuAD dataset"""

    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        label_list=None,
        metric="squad",
        train_filename=Path("train-v2.0.json"),
        dev_filename=Path("dev-v2.0.json"),
        test_filename=None,
        dev_split=0,
        doc_stride=128,
        max_query_length=64,
        proxies=None,
        max_answers=6,
        **kwargs
    ):
        """
        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param data_dir: The directory in which the train and dev files can be found.
                         If not available the dataset will be loaded automaticaly
                         if the last directory has the same name as a predefined dataset.
                         These predefined datasets are defined as the keys in the dict at
                         `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
        :type data_dir: str
        :param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
        :type label_list: list
        :param metric: name of metric that shall be used for evaluation, can be "squad" or "top_n_accuracy"
        :type metric: str
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: None
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
        :type dev_split: float
        :param doc_stride: When the document containing the answer is too long it gets split into part, strided by doc_stride
        :type doc_stride: int
        :param max_query_length: Maximum length of the question (in number of subword tokens)
        :type max_query_length: int
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """

        self.target = "classification"
        self.ph_output_type = "per_token_squad"

        assert doc_stride < (max_seq_len - max_query_length), \
            "doc_stride ({}) is longer than max_seq_len ({}) minus space reserved for query tokens ({}). \nThis means that there will be gaps " \
            "as the passage windows slide, causing the model to skip over parts of the document.\n" \
            "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)\n " \
            "Or decrease max_query_length".format(doc_stride, max_seq_len, max_query_length)

        self.doc_stride = doc_stride
        self.max_query_length = max_query_length
        self.max_answers = max_answers
        super(SquadProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies
        )
        self._initialize_special_tokens_count()
        if metric and label_list:
            self.add_task("question_answering", metric, label_list)
        else:
            logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
                        "using the default task or add a custom task later via processor.add_task()")

    def dataset_from_dicts(self, dicts, indices, return_baskets=False):
        """
        Convert input dictionaries into a pytorch dataset for Question Answering.
        For this we have an internal representation called "baskets".
        Each basket is a question-document pair.
        Each stage adds or transforms specific information to our baskets.

        :param dicts: dict, input dictionary with SQuAD style information present
        :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique
        :param return_baskets: boolean, weather to return the baskets or not (baskets are needed during inference)
        """
        # Convert to standard format
        pre_baskets = [self.convert_qa_input_dict(x) for x in dicts] # TODO move to input object conversion

        # Tokenize documents and questions
        baskets = tokenize_batch_question_answering(pre_baskets, self.tokenizer, indices)

        # Split documents into smaller passages to fit max_seq_len
        baskets = self._split_docs_into_passages(baskets)

        # Convert answers from string to token space, skip this step for inference
        if not return_baskets:
            baskets = self._convert_answers(baskets)

        # Convert internal representation (nested baskets + samples with mixed types) to pytorch features (arrays of numbers)
        baskets = self._passages_to_pytorch_features(baskets, return_baskets)

        # Convert features into pytorch dataset, this step also removes potential errors during preprocessing
        dataset, tensor_names, baskets = self._create_dataset(baskets)

        # Logging
        if 0 in indices:
            self._log_samples(1, baskets)

        # During inference we need to keep the information contained in baskets.
        if return_baskets:
            return dataset, tensor_names, self.problematic_sample_ids, baskets
        else:
            return dataset, tensor_names, self.problematic_sample_ids


    def file_to_dicts(self, file: str) -> [dict]:
        nested_dicts = read_squad_file(filename=file)
        dicts = [y for x in nested_dicts for y in x["paragraphs"]]
        return dicts

    # TODO use Input Objects instead of this function
    def convert_qa_input_dict(self, infer_dict):
        """ Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or
        ["text", "questions"] (api format). This function converts the latter into the former. It also converts the
        is_impossible field to answer_type so that NQ and SQuAD dicts have the same format.
        """
        # check again for doc stride vs max_seq_len when. Parameters can be changed for already initialized models (e.g. in haystack)
        assert self.doc_stride < (self.max_seq_len - self.max_query_length), \
            "doc_stride ({}) is longer than max_seq_len ({}) minus space reserved for query tokens ({}). \nThis means that there will be gaps " \
            "as the passage windows slide, causing the model to skip over parts of the document.\n" \
            "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)\n " \
            "Or decrease max_query_length".format(self.doc_stride, self.max_seq_len, self.max_query_length)

        try:
            # Check if infer_dict is already in internal json format
            if "context" in infer_dict and "qas" in infer_dict:
                return infer_dict
            # converts dicts from inference mode to data structure used in FARM
            questions = infer_dict["questions"]
            text = infer_dict["text"]
            uid = infer_dict.get("id", None)
            qas = [{"question": q,
                    "id": uid,
                    "answers": [],
                    "answer_type": None} for i, q in enumerate(questions)]
            converted = {"qas": qas,
                         "context": text}
            return converted
        except KeyError:
            raise Exception("Input does not have the expected format")

    def _initialize_special_tokens_count(self):
        vec = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=["a"],
                                                              token_ids_1=["b"])
        self.sp_toks_start = vec.index("a")
        self.sp_toks_mid = vec.index("b") - self.sp_toks_start - 1
        self.sp_toks_end = len(vec) - vec.index("b") - 1

    def _split_docs_into_passages(self, baskets):
        """
        Because of the sequence length limitation of Language Models, the documents need to be divided into smaller
        parts that we call passages.
        """
        n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True)
        for basket in baskets:
            samples = []
            ########## perform some basic checking
            # TODO, eventually move checking into input validation functions
            # ignore samples with empty context
            if basket.raw["document_text"] == "":
                logger.warning("Ignoring sample with empty context")
                continue
            ########## end checking


            # Calculate the number of tokens that can be reserved for the passage. This is calculated by considering
            # the max_seq_len, the number of tokens in the question and the number of special tokens that will be added
            # when the question and passage are joined (e.g. [CLS] and [SEP])
            passage_len_t = self.max_seq_len - len(basket.raw["question_tokens"][:self.max_query_length]) - n_special_tokens


            # passage_spans is a list of dictionaries where each defines the start and end of each passage
            # on both token and character level
            try:
                passage_spans = get_passage_offsets(basket.raw["document_offsets"],
                                                    self.doc_stride,
                                                    passage_len_t,
                                                    basket.raw["document_text"])
            except Exception as e:
                logger.warning(f"Could not devide document into passages. Document: {basket.raw['document_text'][:200]}\n"
                               f"With error: {e}")
                passage_spans = []

            for passage_span in passage_spans:
                # Unpack each variable in the dictionary. The "_t" and "_c" indicate
                # whether the index is on the token or character level
                passage_start_t = passage_span["passage_start_t"]
                passage_end_t = passage_span["passage_end_t"]
                passage_start_c = passage_span["passage_start_c"]
                passage_end_c = passage_span["passage_end_c"]

                passage_start_of_word = basket.raw["document_start_of_word"][passage_start_t: passage_end_t]
                passage_tokens = basket.raw["document_tokens"][passage_start_t: passage_end_t]
                passage_text = basket.raw["document_text"][passage_start_c: passage_end_c]

                clear_text = {"passage_text": passage_text,
                              "question_text": basket.raw["question_text"],
                              "passage_id": passage_span["passage_id"],
                              }
                tokenized = {"passage_start_t": passage_start_t,
                             "passage_start_c": passage_start_c,
                             "passage_tokens": passage_tokens,
                             "passage_start_of_word": passage_start_of_word,
                             "question_tokens": basket.raw["question_tokens"][:self.max_query_length],
                             "question_offsets": basket.raw["question_offsets"][:self.max_query_length],
                             "question_start_of_word": basket.raw["question_start_of_word"][:self.max_query_length],
                             }
                # The sample ID consists of internal_id and a passage numbering
                sample_id = f"{basket.id_internal}-{passage_span['passage_id']}"
                samples.append(Sample(id=sample_id,
                                      clear_text=clear_text,
                                      tokenized=tokenized))


            basket.samples=samples

        return baskets

    def _convert_answers(self, baskets):
        """
        Converts answers that are pure strings into the token based representation with start and end token offset.
        Can handle multiple answers per question document pair as is common for development/text sets
        """
        for basket in baskets:
            error_in_answer = False
            for num, sample in enumerate(basket.samples):
                # Dealing with potentially multiple answers (e.g. Squad dev set)
                # Initializing a numpy array of shape (max_answers, 2), filled with -1 for missing values
                label_idxs = np.full((self.max_answers, 2), fill_value=-1)

                if error_in_answer or (len(basket.raw["answers"]) == 0):
                    # If there are no answers we set
                    label_idxs[0, :] = 0
                else:
                    # For all other cases we use start and end token indices, that are relative to the passage
                    for i, answer in enumerate(basket.raw["answers"]):
                        # Calculate start and end relative to document
                        answer_len_c = len(answer["text"])
                        answer_start_c = answer["answer_start"]
                        answer_end_c = answer_start_c + answer_len_c - 1

                        # Convert character offsets to token offsets on document level
                        answer_start_t = offset_to_token_idx_vecorized(basket.raw["document_offsets"], answer_start_c)
                        answer_end_t = offset_to_token_idx_vecorized(basket.raw["document_offsets"], answer_end_c)
                        # TODO remove after testing 'offset_to_token_idx_vecorized()'
                        # answer_start_t2 = offset_to_token_idx(doc_offsets, answer_start_c)
                        # answer_end_t2 = offset_to_token_idx(doc_offsets, answer_end_c)
                        # if (answer_start_t != answer_start_t2) or (answer_end_t != answer_end_t2):
                        #     pass

                        # Adjust token offsets to be relative to the passage
                        answer_start_t -= sample.tokenized["passage_start_t"]
                        answer_end_t -= sample.tokenized["passage_start_t"]

                        # Initialize some basic variables
                        question_len_t = len(sample.tokenized["question_tokens"])
                        passage_len_t = len(sample.tokenized["passage_tokens"])

                        # Check that start and end are contained within this passage
                        # answer_end_t is 0 if the first token is the answer
                        # answer_end_t is passage_len_t if the last token is the answer
                        if passage_len_t > answer_start_t >= 0 and passage_len_t >= answer_end_t >= 0:
                            # Then adjust the start and end offsets by adding question and special token
                            label_idxs[i][0] = self.sp_toks_start + question_len_t + self.sp_toks_mid + answer_start_t
                            label_idxs[i][1] = self.sp_toks_start + question_len_t + self.sp_toks_mid + answer_end_t
                        # If the start or end of the span answer is outside the passage, treat passage as no_answer
                        else:
                            label_idxs[i][0] = 0
                            label_idxs[i][1] = 0

                        ########## answer checking ##############################
                        # TODO, move this checking into input validation functions and delete wrong examples there
                        # Cases where the answer is not within the current passage will be turned into no answers by the featurization fn
                        if answer_start_t < 0 or answer_end_t >= passage_len_t:
                            pass
                        else:
                            doc_text = basket.raw["document_text"]
                            answer_indices = doc_text[answer_start_c: answer_end_c + 1]
                            answer_text = answer["text"]
                            # check if answer string can be found in context
                            if answer_text not in doc_text:
                                logger.warning(f"Answer '{answer['text']}' not contained in context.\n"
                                               f"Example will not be converted for training/evaluation.")
                                error_in_answer = True
                                label_idxs[i][0] = -100  # TODO remove this hack also from featurization
                                label_idxs[i][1] = -100
                                break  # Break loop around answers, so the error message is not shown multiple times
                            elif answer_indices.strip() != answer_text.strip():
                                logger.warning(f"Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n"
                                               f"Example will not be converted for training/evaluation.")
                                error_in_answer = True
                                label_idxs[i][0] = -100 # TODO remove this hack also from featurization
                                label_idxs[i][1] = -100
                                break # Break loop around answers, so the error message is not shown multiple times
                        ########## end of checking ####################

                sample.tokenized["labels"] = label_idxs

        return baskets

    def _passages_to_pytorch_features(self, baskets, return_baskets):
        """
        Convert internal representation (nested baskets + samples with mixed types) to python features (arrays of numbers).
        We first join question and passages into on large vector.
        Then we add additional vectors for: - #TODO
        """
        for basket in baskets:
            # Add features to samples
            for num, sample in enumerate(basket.samples):
                # Initialize some basic variables
                question_tokens = sample.tokenized["question_tokens"]
                question_start_of_word = sample.tokenized["question_start_of_word"]
                question_len_t = len(question_tokens)
                passage_start_t = sample.tokenized["passage_start_t"]
                passage_tokens = sample.tokenized["passage_tokens"]
                passage_start_of_word = sample.tokenized["passage_start_of_word"]
                passage_len_t = len(passage_tokens)
                sample_id = [int(x) for x in sample.id.split("-")]

                # - Combines question_tokens and passage_tokens into a single vector called input_ids
                # - input_ids also contains special tokens (e.g. CLS or SEP tokens).
                # - It will have length = question_len_t + passage_len_t + n_special_tokens. This may be less than
                #   max_seq_len but never greater since truncation was already performed when the document was chunked into passages
                question_input_ids = sample.tokenized["question_tokens"]
                passage_input_ids = sample.tokenized["passage_tokens"]

                input_ids = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=question_input_ids,
                                                                       token_ids_1=passage_input_ids)

                segment_ids = self.tokenizer.create_token_type_ids_from_sequences(token_ids_0=question_input_ids,
                                                                             token_ids_1=passage_input_ids)
                # To make the start index of passage tokens the start manually
                seq_2_start_t = self.sp_toks_start + question_len_t + self.sp_toks_mid

                start_of_word = [0] * self.sp_toks_start + \
                                    question_start_of_word + \
                                    [0] * self.sp_toks_mid + \
                                    passage_start_of_word + \
                                    [0] * self.sp_toks_end

                # The mask has 1 for real tokens and 0 for padding tokens. Only real
                # tokens are attended to.
                padding_mask = [1] * len(input_ids)

                # The passage mask has 1 for tokens that are valid start or ends for QA spans.
                # 0s are assigned to question tokens, mid special tokens, end special tokens and padding
                # Note that start special tokens are assigned 1 since they can be chosen for a no_answer prediction
                span_mask = [1] * self.sp_toks_start
                span_mask += [0] * question_len_t
                span_mask += [0] * self.sp_toks_mid
                span_mask += [1] * passage_len_t
                span_mask += [0] * self.sp_toks_end

                # Pad up to the sequence length. For certain models, the pad token id is not 0 (e.g. Roberta where it is 1)
                pad_idx = self.tokenizer.pad_token_id
                padding = [pad_idx] * (self.max_seq_len - len(input_ids))
                zero_padding = [0] * (self.max_seq_len - len(input_ids))

                input_ids += padding
                padding_mask += zero_padding
                segment_ids += zero_padding
                start_of_word += zero_padding
                span_mask += zero_padding

                # TODO possibly remove these checks after input validation is in place
                len_check = len(input_ids) == len(padding_mask) == len(segment_ids) == len(start_of_word) == len(span_mask)
                id_check = len(sample_id) == 3
                label_check = return_baskets or len(sample.tokenized.get("labels",[])) == self.max_answers
                label_check2 = return_baskets or np.all(sample.tokenized["labels"] > -99) # labels are set to -100 when answer cannot be found
                if len_check and id_check and label_check and label_check2:
                    # - The first of the labels will be used in train, and the full array will be used in eval.
                    # - start_of_word and spec_tok_mask are not actually needed by model.forward() but are needed for
                    #   model.formatted_preds() during inference for creating answer strings
                    # - passage_start_t is index of passage's first token relative to document
                    feature_dict = {"input_ids": input_ids,
                                    "padding_mask": padding_mask,
                                    "segment_ids": segment_ids,
                                    "passage_start_t": passage_start_t,
                                    "start_of_word": start_of_word,
                                    "labels": sample.tokenized.get("labels",[]),
                                    "id": sample_id,
                                    "seq_2_start_t": seq_2_start_t,
                                    "span_mask": span_mask}
                    sample.features = [feature_dict] # other processor's features can be lists
                else:
                    self.problematic_sample_ids.add(sample.id)
                    sample.features = None
        return baskets

    def _create_dataset(self, baskets):
        """
        Convert python features into pytorch dataset.
        Also removes potential errors during preprocessing.
        Flattens nested basket structure to create a flat list of features
        """
        features_flat = []
        basket_to_remove = []
        for basket in baskets:
            if self._check_sample_features(basket):
                for sample in basket.samples:
                    features_flat.extend(sample.features)
            else:
                # remove the entire basket
                basket_to_remove.append(basket)
        if len(basket_to_remove) > 0:
            for basket in basket_to_remove:
                # if basket_to_remove is not empty remove the related baskets
                baskets.remove(basket)

        dataset, tensor_names = convert_features_to_dataset(features=features_flat)
        return dataset, tensor_names, baskets

    def _log_samples(self, n_samples, baskets):
        logger.info("*** Show {} random examples ***".format(n_samples))
        for i in range(n_samples):
            random_basket = random.choice(baskets)
            random_sample = random.choice(random_basket.samples)
            logger.info(random_sample)


class NaturalQuestionsProcessor(Processor):
    """ Used to handle the Natural Question QA dataset"""

    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        train_filename=Path("train-v2.0.json"),
        dev_filename=Path("dev-v2.0.json"),
        test_filename=None,
        dev_split=0,
        doc_stride=128,
        max_query_length=64,
        proxies=None,
        keep_no_answer=0.02,
        downsample_context_size=None,
        inference=False,
        max_answers=6,
        **kwargs):
        """
        Deals with all the preprocessing steps needed for Natural Questions. Follows Alberti 2019 et al. (https://arxiv.org/abs/1901.08634)
        in merging multiple disjoint short answers into the one longer label span and also by downsampling
        samples of no_answer during training

        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param data_dir: The directory in which the train and dev files can be found.
                         If not available the dataset will be loaded automaticaly
                         if the last directory has the same name as a predefined dataset.
                         These predefined datasets are defined as the keys in the dict at
                         `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
        :type data_dir: str
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: The name of the file containing the test data.
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
        :type dev_split: float
        :param doc_stride: When the document containing the answer is too long it gets split into parts, strided by doc_stride
        :type doc_stride: int
        :param max_query_length: Maximum length of the question (in number of subword tokens)
        :type max_query_length: int
        :param keep_no_answer: The probability that a sample with an no_answer label is kept
                                    (0.0 < keep_no_answer <= 1.0). Only works if inference is False
        :type keep_no_answer: float
        :param downsample_context_size: Downsampling before any data conversion by taking a short text window of size
                                        downsample_context_size around the long answer span. To disable set to None
        :type downsample_context_size: int
        :param inference: Whether we are currently using the Processsor for model inference. If True, the
                          keep_no_answer will be overridden and set to 1
        :type inference: bool
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """
        self.target = "classification"
        self.ph_output_type = "per_token_squad"

        # These are classification labels from Natural Questions. Note that in this implementation, we are merging
        # the "long_answer" and "short_answer" labels into the one "span" label
        self.answer_type_list = ["no_answer", "span", "yes", "no"]

        self.doc_stride = doc_stride
        self.max_query_length = max_query_length
        self.keep_no_answer = keep_no_answer
        self.downsample_context_size = downsample_context_size
        self.inference = inference
        self.max_answers = max_answers

        super(NaturalQuestionsProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies
        )

        # Todo rename metric from squad to maybe QA spans or something like that
        self.add_task("question_answering", "squad", ["start_token", "end_token"])
        self.add_task("text_classification", "f1_macro", self.answer_type_list, label_name="answer_type")
        self._initialize_special_tokens_count()

    def _initialize_special_tokens_count(self):
        vec = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=["a"],
                                                              token_ids_1=["b"])
        self.sp_toks_start = vec.index("a")
        self.sp_toks_mid = vec.index("b") - self.sp_toks_start - 1
        self.sp_toks_end = len(vec) - vec.index("b") - 1

    def file_to_dicts(self, file: str) -> [dict]:
        dicts = read_jsonl(file, proxies=self.proxies)
        return dicts

    def _dict_to_samples(self, dictionary: dict, all_dicts=None) -> [Sample]:
        """
            This method will split question-document pairs from the SampleBasket into question-passage pairs which will
        each form one sample. The "t" and "c" in variables stand for token and character respectively. This uses many
        methods that the SquadProcessor calls but note that the SquadProcessor overwrites Processor._dicts_to_baskets()
        while the NaturalQuestionsProcessor does not. This was done in Squad to avoid retokenizing documents that are
        paired with multiple questions. This is not necessary for Natural Questions since there is generally a 1 to 1
        mapping from document to question. Input dictionaries can have either ["context", "qas"] (internal format) as
        keys or ["text", "questions"] (api format). Both are supported.
        """
        # Turns NQ dictionaries into a SQuAD style dictionaries
        if self._is_nq_dict(dictionary):
            dictionary = self._prepare_dict(dictionary=dictionary)

        dictionary_tokenized = self._apply_tokenization(dictionary, self.tokenizer, self.answer_type_list)[0]
        n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True)
        samples = create_samples_qa_Natural_Question(dictionary_tokenized,
                                    self.max_query_length,
                                    self.max_seq_len,
                                    self.doc_stride,
                                    n_special_tokens)
        # Downsample the number of samples with an no_answer label. This fn will always return at least one sample
        # so that we don't end up with a basket with 0 samples
        if not self.inference:
            samples = self._downsample(samples, self.keep_no_answer)
        return samples

    @staticmethod
    def _is_nq_dict(dictionary):
        if set(dictionary.keys()) == {'document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'}:
            return True
        return False

    def _downsample(self, samples, keep_prob):
        # Downsamples samples with a no_answer label (since there is an overrepresentation of these in NQ)
        # This method will always return at least one sample. This is done so that we don't end up with SampleBaskets
        # with 0 samples
        ret = []
        for s in samples:
            if self._check_no_answer_sample(s):
                if random_float() > 1 - keep_prob:
                    ret.append(s)
            else:
                ret.append(s)
        if len(ret) == 0:
            ret = [random.choice(samples)]
        return ret

    def _downsample_unprocessed(self, dictionary):
        doc_text = dictionary["document_text"]
        doc_tokens = doc_text.split(" ")
        annotations = dictionary.get("annotations",[])
        # for simplicity we only downsample wiki pages with one long answer annotation
        if len(annotations) == 1:
            annotation = annotations[0]
            # There seem to be cases where there is no answer but an annotation is given as a (-1, -1) long answer
            if self._check_no_answer(annotation):
                dictionary["document_text"] = " ".join(doc_tokens[:self.max_seq_len+randint(1,self.downsample_context_size)])
            else:
                # finding earliest start and latest end labels
                long_answer_start = annotation['long_answer']['start_token']
                long_answer_end = annotation['long_answer']['end_token']
                short_answer_start = 1e10
                short_answer_end = -1
                for s in annotation["short_answers"]:
                    if s["start_token"] < short_answer_start:
                        short_answer_start = s["start_token"]
                    if s["end_token"] > short_answer_end:
                        short_answer_end = s["end_token"]

                start_threshold = min(long_answer_start,short_answer_start) - randint(1,self.downsample_context_size)
                start_threshold = max(0, start_threshold)
                end_threshold = max(long_answer_end,short_answer_end) + randint(1,self.downsample_context_size)

                # taking subset of doc text and shift labels
                sub_document_text = " ".join(
                    doc_tokens[start_threshold:end_threshold]
                )
                dictionary["document_text"] = sub_document_text
                # change of offsets happens in place (of dictionary)
                annotation['long_answer']['start_token'] -= start_threshold
                annotation['long_answer']['end_token'] -= start_threshold
                for s in annotation["short_answers"]:
                    s["start_token"] -= start_threshold
                    s["end_token"] -= start_threshold

        return dictionary


    def _prepare_dict(self, dictionary):
        """ Casts a Natural Questions dictionary that is loaded from a jsonl file into SQuAD format so that
        the same featurization functions can be called for both tasks. Each annotation can be one of four answer types,
        ["yes", "no", "span", "no_answer"]"""

        if self.downsample_context_size is not None:
            dictionary = self._downsample_unprocessed(dictionary)

        converted_answers = []
        doc_text = dictionary["document_text"]
        _, tok_to_ch = split_with_metadata(doc_text)
        for annotation in dictionary["annotations"]:
            # There seem to be cases where there is no answer but an annotation is given as a (-1, -1) long answer
            if self._check_no_answer(annotation):
                continue
            sa_text, sa_start_c = self._unify_short_answers(annotation["short_answers"], doc_text, tok_to_ch)
            la_text, la_start_c = self._retrieve_long_answer(annotation["long_answer"]["start_token"],
                                                             annotation["long_answer"]["end_token"],
                                                             tok_to_ch,
                                                             doc_text)
            # Picks the span to be considered as annotation by choosing between short answer, long answer and no_answer
            text, start_c = self._choose_span(sa_text, sa_start_c, la_text, la_start_c)
            converted_answers.append({"text": text,
                                      "answer_start": start_c})
        if len(converted_answers) == 0:
            answer_type = "no_answer"
        else:
            answer_type = dictionary["annotations"][0]["yes_no_answer"].lower()
            if answer_type == "none":
                answer_type = "span"
        # TODO: answer_type should be in answers since in NQ, each annotator can give either a span, no_answer, yes or no
        converted = {"id": dictionary["example_id"],
                     "context": doc_text,
                     "qas": [{"question": dictionary["question_text"],
                              "id": dictionary["example_id"],
                              "answers": converted_answers,
                              "answer_type": answer_type}]}
        return converted

    @staticmethod
    def _check_no_answer(annotation):
        if annotation["long_answer"]["start_token"] > -1 or annotation["long_answer"]["end_token"] > -1:
            return False
        for sa in annotation["short_answers"]:
            if sa["start_token"] > -1 or sa["end_token"] > -1:
                return False
        else:
            return True

    @staticmethod
    def _check_no_answer_sample(sample):
        sample_tok = sample.tokenized
        if len(sample_tok["answers"]) == 0:
            return True
        first_answer = sample_tok["answers"][0]
        if first_answer["start_t"] < sample_tok["passage_start_t"]:
            return True
        if first_answer["end_t"] > sample_tok["passage_start_t"] + len(sample_tok["passage_tokens"]):
            return True
        if first_answer["answer_type"] == "no_answer":
            return True
        else:
            return False

    def _retrieve_long_answer(self, start_t, end_t, tok_to_ch, doc_text):
        """ Retrieves the string long answer and also its starting character index"""
        start_c, end_c = self._convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text)
        text = doc_text[start_c: end_c]
        return text, start_c

    @staticmethod
    def _choose_span(sa_text, sa_start_c, la_text, la_start_c):
        if sa_text:
            return sa_text, sa_start_c
        elif la_text:
            return la_text, la_start_c
        else:
            return "", -1

    def _unify_short_answers(self, short_answers, doc_text, tok_to_ch):
        """ In cases where an NQ sample has multiple disjoint short answers, this fn generates the single shortest
        span that contains all the answers"""
        if not short_answers:
            return "", -1
        short_answer_idxs = []
        # TODO write comment explaining this
        for short_answer in short_answers:
            short_answer_idxs.append(short_answer["start_token"])
            short_answer_idxs.append(short_answer["end_token"])
        answer_start_t = min(short_answer_idxs)
        answer_end_t = max(short_answer_idxs)
        answer_start_c, answer_end_c = self._convert_tok_to_ch(answer_start_t, answer_end_t, tok_to_ch, doc_text)
        answer_text = doc_text[answer_start_c: answer_end_c]
        assert answer_text == " ".join(doc_text.split()[answer_start_t: answer_end_t])
        return answer_text, answer_start_c

    @staticmethod
    def _convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text):
        n_tokens = len(tok_to_ch)
        if start_t == -1 and end_t == -1:
            return -1, -1
        start_c = tok_to_ch[start_t]
        # when the end of the answer span is the end of the text
        if end_t == n_tokens:
            end_c = len(doc_text)
        else:
            next_word_start_c = tok_to_ch[end_t]
            span = doc_text[:next_word_start_c].strip()
            end_c = len(span)
        return start_c, end_c

    def _sample_to_features(self, sample: Sample) -> dict:
        self._check_valid_answer(sample)
        features = sample_to_features_qa_Natural_Questions(sample=sample,
                                         tokenizer=self.tokenizer,
                                         max_seq_len=self.max_seq_len,
                                         sp_toks_start=self.sp_toks_start,
                                         sp_toks_mid=self.sp_toks_mid,
                                         sp_toks_end=self.sp_toks_end,
                                         answer_type_list=self.answer_type_list,
                                         max_answers=self.max_answers)
        return features

    def _check_valid_answer(self, sample):
        passage_text = sample.clear_text["passage_text"]
        for answer in sample.clear_text["answers"]:
            len_passage = len(passage_text)
            start = answer["start_c"]
            end = answer["end_c"]
            # Cases where the answer is not within the current passage will be turned into no answers by the featurization fn
            if start < 0 or end >= len_passage:
                continue
            answer_indices = passage_text[start: end + 1]
            answer_text = answer["text"]
            if answer_indices != answer_text:
                raise ValueError(
                    f"""Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'""")

    def _dict_to_samples_and_features(self, dictionary: dict, **kwargs) -> [Sample]:
        """
            This method will split the question-document pair from the dictionary into question-passage pairs which will
        each form one sample. The "t" and "c" in variables stand for token and character respectively.
        Input dictionaries can have either ["context", "qas"] (internal format) as keys or ["text", "questions"]
        (api format). Both are supported.
        """
        if self._is_nq_dict(dictionary):
            dictionary = self._prepare_dict(dictionary=dictionary)
        basket_id_internal = kwargs["basket_id_internal"]

        dictionary_tokenized = self._apply_tokenization(dictionary, self.tokenizer, self.answer_type_list)[0]
        n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True)
        samples = create_samples_qa_Natural_Question(dictionary_tokenized,
                                    self.max_query_length,
                                    self.max_seq_len,
                                    self.doc_stride,
                                    n_special_tokens)
        # Downsample the number of samples with an no_answer label. This fn will always return at least one sample
        # so that we don't end up with a basket with 0 samples.
        if not self.inference:
            samples = self._downsample(samples, self.keep_no_answer)

        # Get features for each sample
        for num, sample in enumerate(samples):
            sample.id = f"{basket_id_internal}-{num}"
            features = self._sample_to_features(sample)
            sample.features = features

        return samples

    def _apply_tokenization(self, dictionary, tokenizer, answer_types_list=[]):
        raw_baskets = []
        dictionary = convert_qa_input_dict(dictionary)
        dictionary["qas"] = self._is_impossible_to_answer_type(dictionary["qas"])
        document_text = dictionary["context"]

        document_tokenized = tokenize_with_metadata(document_text, tokenizer)
        document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]]
        questions = dictionary["qas"]
        for question in questions:
            answers = []
            # For training and dev with labelled examples
            try:
                external_id = question["id"]
                question_text = question["question"]
                for answer in question["answers"]:
                    if 'answer_type' in answer.keys() and answer['answer_type'] in answer_types_list:
                        answer_type = answer['answer_type']
                    else:
                        if answer["text"] == "":
                            answer_type = "no_answer"
                        else:
                            answer_type = "span"
                    a = {"text": answer["text"],
                         "offset": answer["answer_start"],
                         "answer_type": answer_type}
                    answers.append(a)
            # For inference where samples are read in as dicts without an id or answers
            except TypeError:
                external_id = try_get(ID_NAMES, dictionary)
                question_text = question

            question_tokenized = tokenize_with_metadata(question_text, tokenizer)
            question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]]

            # During inference, there is no_answer type. Also, question might be a str instead of a dict
            if type(question) == str:
                answer_type = None
            elif type(question) == dict:
                answer_type = question.get("answer_type", None)
            else:
                raise Exception("Question was neither in str nor dict format")

            raw = {"document_text": document_text,
                   "document_tokens": document_tokenized["tokens"],
                   "document_offsets": document_tokenized["offsets"],
                   "document_start_of_word": document_start_of_word,
                   "question_text": question_text,
                   "question_tokens": question_tokenized["tokens"],
                   "question_offsets": question_tokenized["offsets"],
                   "question_start_of_word": question_start_of_word,
                   "answers": answers,
                   "answer_type": answer_type,
                   "external_id": external_id}
            raw_baskets.append(raw)
        return raw_baskets

    def _is_impossible_to_answer_type(self, qas):
        """ Converts questions from having an is_impossible field to having an answer_type field"""
        new_qas = []
        for q in qas:
            answer_type = "span"
            if "is_impossible" in q:
                if q["is_impossible"] == True:
                    answer_type = "no_answer"
                del q["is_impossible"]
                q["answer_type"] = answer_type
            new_qas.append(q)
        return new_qas


class TextSimilarityProcessor(Processor):
    """
    Used to handle the Dense Passage Retrieval (DPR) datasets that come in json format, example: biencoder-nq-train.json, biencoder-nq-dev.json, trivia-train.json, trivia-dev.json

    Datasets can be downloaded from the official DPR github repository (https://github.com/facebookresearch/DPR)
    dataset format: list of dictionaries with keys: 'dataset', 'question', 'answers', 'positive_ctxs', 'negative_ctxs', 'hard_negative_ctxs'
    Each sample is a dictionary of format:
    {"dataset": str,
    "question": str,
    "answers": list of str
    "positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
    "negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
    "hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
    }

    """
    def __init__(
        self,
        query_tokenizer,
        passage_tokenizer,
        max_seq_len_query,
        max_seq_len_passage,
        data_dir="",
        metric=None,
        train_filename="train.json",
        dev_filename=None,
        test_filename="test.json",
        dev_split=0.1,
        proxies=None,
        max_samples=None,
        embed_title=True,
        num_positives=1,
        num_hard_negatives=1,
        shuffle_negatives=True,
        shuffle_positives=False,
        label_list=None,
        **kwargs
    ):
        """
        :param query_tokenizer: Used to split a question (str) into tokens
        :param passage_tokenizer: Used to split a passage (str) into tokens.
        :param max_seq_len_query: Query samples are truncated after this many tokens.
        :type max_seq_len_query: int
        :param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens.
        :type max_seq_len_passage: int
        :param data_dir: The directory in which the train and dev files can be found.
                         If not available the dataset will be loaded automaticaly
                         if the last directory has the same name as a predefined dataset.
                         These predefined datasets are defined as the keys in the dict at
                         `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
        :type data_dir: str
        :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
                 Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
                 For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
        :type metric: str, function, or list
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: None
        :type test_filename: str
        :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
        :type dev_split: float
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param max_samples: maximum number of samples to use
        :type max_samples: int
        :param embed_title: Whether to embed title in passages during tensorization (bool),
        :param num_hard_negatives: maximum number to hard negative context passages in a sample
        :param num_positives: maximum number to positive context passages in a sample
        :param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the num_hard_negative number of passages
        :type shuffle_negatives: bool
        :param shuffle_positives: Whether to shuffle all the positive passages before selecting the num_positive number of passages
        :type shuffle_positives: bool
        :param label_list: list of labels to predict. Usually ["hard_negative", "positive"]
        :type label_list: list[str]
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """
        #TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs

        # Custom processor attributes
        self.max_samples = max_samples
        self.query_tokenizer = query_tokenizer
        self.passage_tokenizer = passage_tokenizer
        self.embed_title = embed_title
        self.num_hard_negatives = num_hard_negatives
        self.num_positives = num_positives
        self.shuffle_negatives = shuffle_negatives
        self.shuffle_positives = shuffle_positives
        self.max_seq_len_query = max_seq_len_query
        self.max_seq_len_passage = max_seq_len_passage

        super(TextSimilarityProcessor, self).__init__(
            tokenizer=None,
            max_seq_len=None,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )
        if metric:
            self.add_task(name="text_similarity",
                          metric=metric,
                          label_list=label_list,
                          label_name="label",
                          task_type="text_similarity")
        else:
            logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
                        "using the default task or add a custom task later via processor.add_task()")

    @classmethod
    def load_from_dir(cls, load_dir):
        """
         Overwriting method from parent class to **always** load the TextSimilarityProcessor instead of the specific class stored in the config.

        :param load_dir: str, directory that contains a 'processor_config.json'
        :return: An instance of an TextSimilarityProcessor
        """
        # read config
        processor_config_file = Path(load_dir) / "processor_config.json"
        config = json.load(open(processor_config_file))
        # init tokenizer
        query_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query")
        passage_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder = "passage")

        # we have to delete the tokenizer string from config, because we pass it as Object
        del config["query_tokenizer"]
        del config["passage_tokenizer"]

        processor = cls.load(query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, processor_name="TextSimilarityProcessor", **config)
        for task_name, task in config["tasks"].items():
            processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])

        if processor is None:
            raise Exception

        return processor

    def save(self, save_dir):
        """
        Saves the vocabulary to file and also creates a json file containing all the
        information needed to load the same processor.

        :param save_dir: Directory where the files are to be saved
        :type save_dir: str
        """
        if isinstance(save_dir,str):
            save_dir = Path(save_dir)
        os.makedirs(save_dir, exist_ok=True)
        config = self.generate_config()
        # save tokenizer incl. attributes
        config["query_tokenizer"] = self.query_tokenizer.__class__.__name__
        config["passage_tokenizer"] = self.passage_tokenizer.__class__.__name__

        # Because the fast tokenizers expect a str and not Path
        # always convert Path to str here.
        self.query_tokenizer.save_pretrained(str(save_dir / "query"))
        self.passage_tokenizer.save_pretrained(str(save_dir / "passage"))

        # save processor
        config["processor"] = self.__class__.__name__
        output_config_file = Path(save_dir) / "processor_config.json"
        with open(output_config_file, "w") as file:
            json.dump(config, file)

    def file_to_dicts(self, file: str) -> [dict]:
        """
        Converts a Dense Passage Retrieval (DPR) data file in json format to a list of dictionaries.

        :param file: filename of DPR data in json format
                Each sample is a dictionary of format:
                {"dataset": str,
                "question": str,
                "answers": list of str
                "positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
                "negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
                "hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
                }


        Returns:
        list of dictionaries: List[dict]
            each dictionary:
            {"query": str,
            "passages": [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
            {"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
            ...]}
        """
        dicts = read_dpr_json(file, max_samples=self.max_samples, num_hard_negatives=self.num_hard_negatives, num_positives=self.num_positives, shuffle_negatives=self.shuffle_negatives, shuffle_positives=self.shuffle_positives)

        # shuffle dicts to make sure that similar positive passages do not end up in one batch
        dicts = random.sample(dicts, len(dicts))
        return dicts

    def dataset_from_dicts(self, dicts, indices=None, return_baskets = False):
        """
        Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR).
        For conversion we have an internal representation called "baskets".
        Each basket is one query and related text passages (positive passages fitting to the query and negative
        passages that do not fit the query)
        Each stage adds or transforms specific information to our baskets.

        :param dicts: dict, input dictionary with DPR-style content
                        {"query": str,
                         "passages": List[
                                        {'title': str,
                                        'text': str,
                                        'label': 'hard_negative',
                                        'external_id': str},
                                        ....
                                        ]
                         }
        :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique
        :param return_baskets: boolean, weather to return the baskets or not (baskets are needed during inference)
        """

        # Take the dict and insert into our basket structure, this stages also adds an internal IDs
        baskets = self._fill_baskets(dicts, indices)

        # Separat conversion of query
        baskets = self._convert_queries(baskets=baskets)

        # and context passages. When converting the context the label is also assigned.
        baskets = self._convert_contexts(baskets=baskets)

        # Convert features into pytorch dataset, this step also removes and logs potential errors during preprocessing
        dataset, tensor_names, problematic_ids, baskets = self._create_dataset(baskets)

        if problematic_ids:
            logger.error(f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}")

        if return_baskets:
            return dataset, tensor_names, problematic_ids, baskets
        else:
            return dataset, tensor_names, problematic_ids

    def _fill_baskets(self, dicts, indices):
        baskets = []
        if not indices:
            indices = range(len(dicts))
        for d, id_internal in zip(dicts,indices):
            basket = SampleBasket(id_external=None,
                                  id_internal=id_internal,
                                  raw=d)
            baskets.append(basket)
        return baskets

    def _convert_queries(self, baskets):
        for basket in baskets:
            clear_text = {}
            tokenized = {}
            features = [{}]
            # extract query, positive context passages and titles, hard-negative passages and titles
            if "query" in basket.raw:
                try:
                    query = self._normalize_question(basket.raw["query"])

                    # featurize the query
                    query_inputs = self.query_tokenizer.encode_plus(
                        text=query,
                        max_length=self.max_seq_len_query,
                        add_special_tokens=True,
                        truncation=True,
                        truncation_strategy='longest_first',
                        padding="max_length",
                        return_token_type_ids=True,
                    )

                    # tokenize query
                    tokenized_query = self.query_tokenizer.convert_ids_to_tokens(query_inputs["input_ids"])

                    if len(tokenized_query) == 0:
                        logger.warning(
                            f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize")
                        return None

                    clear_text["query_text"] = query
                    tokenized["query_tokens"] = tokenized_query
                    features[0]["query_input_ids"] = query_inputs["input_ids"]
                    features[0]["query_segment_ids"] = query_inputs["token_type_ids"]
                    features[0]["query_attention_mask"] = query_inputs["attention_mask"]
                except Exception as e:
                    features = None

            sample = Sample(id=None,
                            clear_text=clear_text,
                            tokenized=tokenized,
                            features=features)
            basket.samples = [sample]
        return baskets

    def _convert_contexts(self, baskets):
        for basket in baskets:
            if "passages" in basket.raw:
                try:
                    positive_context = list(filter(lambda x: x["label"] == "positive", basket.raw["passages"]))
                    if self.shuffle_positives:
                        random.shuffle(positive_context)
                    positive_context = positive_context[:self.num_positives]
                    hard_negative_context = list(filter(lambda x: x["label"] == "hard_negative", basket.raw["passages"]))
                    if self.shuffle_negatives:
                        random.shuffle(hard_negative_context)
                    hard_negative_context = hard_negative_context[:self.num_hard_negatives]

                    positive_ctx_titles = [passage.get("title", None) for passage in positive_context]
                    positive_ctx_texts = [passage["text"] for passage in positive_context]
                    hard_negative_ctx_titles = [passage.get("title", None) for passage in hard_negative_context]
                    hard_negative_ctx_texts = [passage["text"] for passage in hard_negative_context]

                    # all context passages and labels: 1 for positive context and 0 for hard-negative context
                    ctx_label = [1] * self.num_positives + [0] * self.num_hard_negatives
                    # featurize context passages
                    if self.embed_title:
                        # concatenate title with positive context passages + negative context passages
                        all_ctx = self._combine_title_context(positive_ctx_titles, positive_ctx_texts) + \
                                  self._combine_title_context(hard_negative_ctx_titles, hard_negative_ctx_texts)
                    else:
                        all_ctx = positive_ctx_texts + hard_negative_ctx_texts

                    # assign empty string tuples if hard_negative passages less than num_hard_negatives
                    all_ctx += [('', '')] * ((self.num_positives + self.num_hard_negatives) - len(all_ctx))

                    ctx_inputs = self.passage_tokenizer.batch_encode_plus(
                        all_ctx,
                        add_special_tokens=True,
                        truncation=True,
                        padding="max_length",
                        max_length=self.max_seq_len_passage,
                        return_token_type_ids=True
                    )

                    # TODO check if we need this and potentially remove
                    ctx_segment_ids = np.zeros_like(ctx_inputs["token_type_ids"], dtype=np.int32)

                    # get tokens in string format
                    tokenized_passage = [self.passage_tokenizer.convert_ids_to_tokens(ctx) for ctx in ctx_inputs["input_ids"]]

                    # for DPR we only have one sample containing query and corresponding (multiple) context features
                    sample = basket.samples[0]
                    sample.clear_text["passages"] = positive_context + hard_negative_context
                    sample.tokenized["passages_tokens"] = tokenized_passage
                    sample.features[0]["passage_input_ids"] = ctx_inputs["input_ids"]
                    sample.features[0]["passage_segment_ids"] = ctx_segment_ids
                    sample.features[0]["passage_attention_mask"] = ctx_inputs["attention_mask"]
                    sample.features[0]["label_ids"] = ctx_label
                except Exception as e:
                    basket.samples[0].features = None

        return baskets

    def _create_dataset(self, baskets):
        """
        Convert python features into pytorch dataset.
        Also removes potential errors during preprocessing.
        Flattens nested basket structure to create a flat list of features
        """
        features_flat = []
        basket_to_remove = []
        problematic_ids = set()
        for basket in baskets:
            if self._check_sample_features(basket):
                for sample in basket.samples:
                    features_flat.extend(sample.features)
            else:
                # remove the entire basket
                basket_to_remove.append(basket)
        if len(basket_to_remove) > 0:
            for basket in basket_to_remove:
                # if basket_to_remove is not empty remove the related baskets
                problematic_ids.add(basket.id_internal)
                baskets.remove(basket)

        dataset, tensor_names = convert_features_to_dataset(features=features_flat)
        return dataset, tensor_names, problematic_ids, baskets

    @staticmethod
    def _normalize_question(question: str) -> str:
        """Removes '?' from queries/questions"""
        if question[-1] == '?':
            question = question[:-1]
        return question

    @staticmethod
    def _combine_title_context(titles, texts):
        res = []
        for title, ctx in zip(titles, texts):
            if title is None:
                title = ""
                logger.warning(
                    f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '{ctx}' ")
            res.append(tuple((title, ctx)))
        return res