From 6174381bf2b82b972dc8b216a6c1acdd8205977b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?=
Date: Thu, 15 Dec 2022 17:58:29 +0100
Subject: [PATCH 01/12] Adapt first version of BigBio adapter implementation

---
 flair/datasets/biomedical.py | 195 +++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 0f0ba9ca26..43f1915790 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -5167,3 +5167,198 @@ class HUNER_SPECIES(HunerMultiCorpus):
     def __init__(self, sentence_splitter: SentenceSplitter = None):
         super(HUNER_SPECIES, self).__init__(entity_type="SPECIES", sentence_splitter=sentence_splitter)
+
+
+class BIGBIO_NER_CORPUS(ColumnCorpus):
+    """
+    This class implements an adapter to data sets implemented in the BigBio framework:
+
+    https://github.com/bigscience-workshop/biomedical
+
+    The BigBio framework harmonizes over 120 biomedical data sets and provides a uniform
+    programming API to access them. This adapter allows using all named entity recognition
+    data sets via the bigbio_kb schema.
+    """
+
+    def __init__(
+        self,
+        dataset_name: str,
+        base_path: Union[str, Path] = None,
+        in_memory: bool = True,
+        sentence_splitter: SentenceSplitter = None
+    ):
+        """
+        :param dataset_name: Name of the dataset in the huggingface hub (e.g. nlmchem or bigbio/nlmchem)
+        :param base_path: Path to the corpus on your machine
+        :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which
+            segments the text into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
+        """
+
+        if base_path is None:
+            base_path = flair.cache_root / "datasets"
+        else:
+            base_path = Path(base_path)
+
+        # column format
+        columns = {0: "text", 1: "ner"}
+
+        # build dataset name and full huggingface reference name
+        if not dataset_name.startswith("bigbio/"):
+            full_dataset_name = "bigbio" + "/" + dataset_name
+        else:
+            full_dataset_name = dataset_name
+            dataset_name = dataset_name.replace("bigbio/", "")
+
+        dataset_dir_name = self.build_corpus_directory_name(dataset_name)
+        data_folder = base_path / dataset_dir_name
+
+        train_file = data_folder / "train.conll"
+        test_file = data_folder / "test.conll"
+
+        if not (train_file.exists() and test_file.exists()):
+            from datasets import load_dataset
+            dataset = load_dataset(full_dataset_name, name=dataset_name + "_bigbio_kb")
+
+            splits = {
+                "train": self.to_internal_dataset(dataset, "train"),
+                "test": self.to_internal_dataset(dataset, "test")
+            }
+
+            # Not every dataset has a dev / validation set!
+            if "validation" in dataset:
+                splits["dev"] = self.to_internal_dataset(dataset, "validation")
+
+            # Perform type mapping if necessary
+            type_mapping = self.get_entity_type_mapping()
+            if type_mapping:
+                splits = {
+                    split: filter_and_map_entities(dataset, type_mapping)
+                    for split, dataset in splits.items()
+                }
+
+            if sentence_splitter is None:
+                sentence_splitter = SciSpacySentenceSplitter()
+
+            conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
+            conll_writer.process_dataset(splits, data_folder)
+
+        super(BIGBIO_NER_CORPUS, self).__init__(
+            data_folder,
+            columns,
+            in_memory=in_memory,
+            comment_symbol="#",
+        )
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        """
+        Return the mapping of entity type given in the dataset to canonical types. Note: if
+        an entity type is not present in the map, it is discarded.
+        """
+        return None
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        """
+        Builds the directory name for the given data set.
+        """
+        return "bigbio-" + dataset_name.lower()
+
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        id_to_text = {}
+        id_to_entities = {}
+        for document in dataset[split]:
+            document_id = document["document_id"]
+            passage_offsets = []
+
+            # Collect all texts of the document, each passage will be
+            # a text in our internal format
+            for passage in document["passages"]:
+                passage_id = document_id + "#" + str(passage["id"])
+                id_to_text[passage_id] = " ".join(passage["text"])
+                passage_offsets.append((passage_id, passage["offsets"]))
+
+                id_to_entities[passage_id] = []
+
+            # Sort passages by start offset
+            passage_offsets = sorted(passage_offsets, key=lambda e: e[1][0][0])
+
+            # Transform all entity annotations into internal format
+            for entity in document["entities"]:
+                # Find the passage of the entity (necessary for offset adaptation)
+                passage_id, passage_offset = self.bin_search_passage(passage_offsets, 0, len(passage_offsets)-1, entity)
+
+                # Adapt entity offsets according to passage offsets
+                entity_offset = entity["offsets"][0]
+                entity_offset = (entity_offset[0] - passage_offset[0], entity_offset[1] - passage_offset[0])
+
+                id_to_entities[passage_id].append(
+                    Entity(char_span=entity_offset, entity_type=entity["type"])
+                )
+
+                # FIXME: This is just for debugging purposes
+                passage_text = id_to_text[passage_id]
+                doc_text = passage_text[entity_offset[0]:entity_offset[1]]
+                mention_text = entity["text"][0]
+                if doc_text != mention_text:
+                    print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}")
+
+        return InternalBioNerDataset(
+            documents=id_to_text,
+            entities_per_document=id_to_entities
+        )
+
+    def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]], low: int, high: int, entity: Dict):
+        """
+        Helper method to find the passage for a given entity mention (incl. offset). The implementation
+        uses binary search to find the passage in the ordered sequence of passages.
+        """
+        # Check base case
+        if high >= low:
+            # Get element in the middle
+            mid = (high + low) // 2
+            first_text_offset = passages[mid][1][0]
+            first_mention_offset = entity["offsets"][0]
+
+            # Is the mention within the passage offsets?
+            if first_mention_offset[0] >= first_text_offset[0] and first_mention_offset[1] <= first_text_offset[1]:
+                return passages[mid][0], first_text_offset
+
+            # If the mention starts before the current passage, it can only
+            # be present in the left subarray
+            elif first_text_offset[0] > first_mention_offset[0]:
+                return self.bin_search_passage(passages, low, mid - 1, entity)
+            else:
+                # Else the mention can only be present in the right subarray
+                return self.bin_search_passage(passages, mid + 1, high, entity)
+
+        else:
+            # This should never happen :-D
+            return None
+
+
+class HUNER_CHEMICAL_NLM_CHEM(BIGBIO_NER_CORPUS):
+
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_NLM_CHEM, self).__init__(*args, dataset_name="nlmchem", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Chemical": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
+
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene_or_gene_product": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+

From 20821ec80f806c36227e7c0f1ea8570ce5bae79e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?=
Date: Thu, 19 Jan 2023 09:37:30 +0100
Subject: [PATCH 02/12] Bug fix: only write dev/val split if it exists

---
 flair/datasets/biomedical.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 43f1915790..f29d9bde03 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -350,9 +350,11 @@ def __init__(
 
     def process_dataset(self, datasets: Dict[str, InternalBioNerDataset], out_dir: Path):
         self.write_to_conll(datasets["train"], out_dir / "train.conll")
-        self.write_to_conll(datasets["dev"], out_dir / "dev.conll")
         self.write_to_conll(datasets["test"], out_dir / "test.conll")
 
+        if "dev" in datasets:
+            self.write_to_conll(datasets["dev"], out_dir / "dev.conll")
+
     def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path):
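With patches 01 and 02 applied, the adapter already supports end-to-end corpus loading. A minimal usage sketch (assuming a flair checkout containing these patches plus the `datasets` and `scispacy` extras installed; this snippet is not part of the patch series itself):

```python
# Load a BigBio-backed NER corpus through the new adapter. On first use this
# downloads "bigbio/nlmchem" with the "nlmchem_bigbio_kb" schema and writes
# train/dev/test CoNLL files below flair.cache_root / "datasets".
from flair.datasets.biomedical import HUNER_CHEMICAL_NLM_CHEM

corpus = HUNER_CHEMICAL_NLM_CHEM()
print(corpus)  # standard flair corpus summary (train/dev/test sentence counts)
```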
From 35c9330fe3cf103033747dd5928f464976b9bdc0 Mon Sep 17 00:00:00 2001
From: Xing Wang
Date: Tue, 7 Feb 2023 12:14:41 +0100
Subject: [PATCH 03/12] Added BigBio adapter classes for new datasets of HunFlair v2

---
 flair/datasets/biomedical.py | 390 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 345 insertions(+), 45 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 17a50b911f..706d32cda2 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -349,11 +349,12 @@ def __init__(
         self.sentence_splitter = sentence_splitter
 
     def process_dataset(self, datasets: Dict[str, InternalBioNerDataset], out_dir: Path):
-        self.write_to_conll(datasets["train"], out_dir / "train.conll")
-        self.write_to_conll(datasets["test"], out_dir / "test.conll")
-
+        if "train" in datasets:
+            self.write_to_conll(datasets["train"], out_dir / "train.conll")
         if "dev" in datasets:
             self.write_to_conll(datasets["dev"], out_dir / "dev.conll")
+        if "test" in datasets:
+            self.write_to_conll(datasets["test"], out_dir / "test.conll")
 
     def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path):
         os.makedirs(str(output_file.parent), exist_ok=True)
@@ -1727,7 +1728,7 @@ class HUNER_CHEMICAL_CHEMDNER(HunerDataset):
     """
 
     def __init__(self, *args, download_folder=None, **kwargs):
-        self.download_folder = download_folder or CHEMDNER.default_dir / "original"
+        self.download_folder = download_folder
         super().__init__(*args, **kwargs)
 
     @staticmethod
@@ -1735,6 +1736,7 @@ def split_url() -> str:
         return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chemdner"
 
     def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
+        self.download_folder = data_dir / "original"
         os.makedirs(str(self.download_folder), exist_ok=True)
         CHEMDNER.download_dataset(self.download_folder)
         train_data = bioc_to_internal(self.download_folder / "chemdner_corpus" / "training.bioc.xml")
@@ -5173,21 +5175,21 @@ def __init__(self, sentence_splitter: SentenceSplitter = None):
 
 class BIGBIO_NER_CORPUS(ColumnCorpus):
     """
-    This class implements an adapter to data sets implemented in the BigBio framework:
+    This class implements an adapter to data sets implemented in the BigBio framework:
 
-    https://github.com/bigscience-workshop/biomedical
+    https://github.com/bigscience-workshop/biomedical
 
-    The BigBio framework harmonizes over 120 biomedical data sets and provides a uniform
-    programming API to access them. This adapter allows using all named entity recognition
-    data sets via the bigbio_kb schema.
+    The BigBio framework harmonizes over 120 biomedical data sets and provides a uniform
+    programming API to access them. This adapter allows using all named entity recognition
+    data sets via the bigbio_kb schema.
     """
 
     def __init__(
-        self,
-        dataset_name: str,
-        base_path: Union[str, Path] = None,
-        in_memory: bool = True,
-        sentence_splitter: SentenceSplitter = None
+        self,
+        dataset_name: str,
+        base_path: Union[str, Path] = None,
+        in_memory: bool = True,
+        sentence_splitter: SentenceSplitter = None,
     ):
         """
         :param dataset_name: Name of the dataset in the huggingface hub (e.g. nlmchem or bigbio/nlmchem)
@@ -5218,26 +5220,43 @@ def __init__(
         train_file = data_folder / "train.conll"
         test_file = data_folder / "test.conll"
 
-        if not (train_file.exists() and test_file.exists()):
+        # Download data if necessary
+        # Some datasets only have train or test splits, not both
+        if not train_file.exists() and not test_file.exists():
             from datasets import load_dataset
+
             dataset = load_dataset(full_dataset_name, name=dataset_name + "_bigbio_kb")
 
-            splits = {
-                "train": self.to_internal_dataset(dataset, "train"),
-                "test": self.to_internal_dataset(dataset, "test")
-            }
+            # Special case for ProGene: We need to use the split_0_train and split_0_test splits
+            train_split_name = None
+            if "train" in dataset:
+                train_split_name = "train"
+            elif "split_0_train" in dataset:
+                train_split_name = "split_0_train"
+            test_split_name = None
+            if "test" in dataset:
+                test_split_name = "test"
+            elif "split_0_test" in dataset:
+                test_split_name = "split_0_test"
+            validation_split_name = None
+            if "validation" in dataset:
+                validation_split_name = "validation"
+            elif "split_0_validation" in dataset:
+                validation_split_name = "split_0_validation"
 
+            splits = {}
             # Not every dataset has a dev / validation set!
-            if "validation" in dataset:
-                splits["dev"] = self.to_internal_dataset(dataset, "validation")
+            if train_split_name is not None:
+                splits["train"] = self.to_internal_dataset(dataset, train_split_name)
+            if test_split_name is not None:
+                splits["test"] = self.to_internal_dataset(dataset, test_split_name)
+            if validation_split_name is not None:
+                splits["dev"] = self.to_internal_dataset(dataset, validation_split_name)
 
             # Perform type mapping if necessary
             type_mapping = self.get_entity_type_mapping()
             if type_mapping:
-                splits = {
-                    split: filter_and_map_entities(dataset, type_mapping)
-                    for split, dataset in splits.items()
-                }
+                splits = {split: filter_and_map_entities(dataset, type_mapping) for split, dataset in splits.items()}
 
             if sentence_splitter is None:
                 sentence_splitter = SciSpacySentenceSplitter()
@@ -5254,20 +5273,20 @@ def __init__(
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         """
-        Return the mapping of entity type given in the dataset to canonical types. Note: if
-        an entity type is not present in the map, it is discarded.
+        Return the mapping of entity type given in the dataset to canonical types. Note: if
+        an entity type is not present in the map, it is discarded.
         """
         return None
 
     def build_corpus_directory_name(self, dataset_name: str) -> str:
         """
-        Builds the directory name for the given data set.
+        Builds the directory name for the given data set.
         """
         return "bigbio-" + dataset_name.lower()
 
     def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
         """
-        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
         """
         id_to_text = {}
         id_to_entities = {}
@@ -5290,32 +5309,29 @@ def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
             # Transform all entity annotations into internal format
             for entity in document["entities"]:
                 # Find the passage of the entity (necessary for offset adaptation)
-                passage_id, passage_offset = self.bin_search_passage(passage_offsets, 0, len(passage_offsets)-1, entity)
+                passage_id, passage_offset = self.bin_search_passage(
+                    passage_offsets, 0, len(passage_offsets) - 1, entity
+                )
 
                 # Adapt entity offsets according to passage offsets
                 entity_offset = entity["offsets"][0]
                 entity_offset = (entity_offset[0] - passage_offset[0], entity_offset[1] - passage_offset[0])
 
-                id_to_entities[passage_id].append(
-                    Entity(char_span=entity_offset, entity_type=entity["type"])
-                )
+                id_to_entities[passage_id].append(Entity(char_span=entity_offset, entity_type=entity["type"]))
 
                 # FIXME: This is just for debugging purposes
                 passage_text = id_to_text[passage_id]
-                doc_text = passage_text[entity_offset[0]:entity_offset[1]]
+                doc_text = passage_text[entity_offset[0] : entity_offset[1]]
                 mention_text = entity["text"][0]
                 if doc_text != mention_text:
                     print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}")
 
-        return InternalBioNerDataset(
-            documents=id_to_text,
-            entities_per_document=id_to_entities
-        )
+        return InternalBioNerDataset(documents=id_to_text, entities_per_document=id_to_entities)
 
     def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]], low: int, high: int, entity: Dict):
         """
-        Helper method to find the passage for a given entity mention (incl. offset). The implementation
-        uses binary search to find the passage in the ordered sequence of passages.
+        Helper method to find the passage for a given entity mention (incl. offset). The implementation
+        uses binary search to find the passage in the ordered sequence of passages.
         """
         # Check base case
         if high >= low:
@@ -5341,10 +5357,174 @@ def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]],
 
         else:
             # This should never happen :-D
             return None
 
 
-class HUNER_CHEMICAL_NLM_CHEM(BIGBIO_NER_CORPUS):
+class HUNER_GENE_NLM_GENE(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_NLM_GENE, self).__init__(*args, dataset_name="nlm_gene", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene": GENE_TAG, "GENERIF": GENE_TAG, "STARGENE": GENE_TAG, "Domain": GENE_TAG, "Other": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
 
+class HUNER_GENE_DRUGPROT(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_CHEMICAL_NLM_CHEM, self).__init__(*args, dataset_name="nlmchem", **kwargs)
+        super(HUNER_GENE_DRUGPROT, self).__init__(*args, dataset_name="drugprot", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"GENE-N": GENE_TAG, "GENE-Y": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_DRUGPROT(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_DRUGPROT, self).__init__(*args, dataset_name="drugprot", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"CHEMICAL": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"GeneOrGeneProduct": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"ChemicalEntity": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_DISEASE_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_DISEASE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"DiseaseOrPhenotypicFeature": DISEASE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_SPECIES_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_SPECIES_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"OrganismTaxon": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_CELL_LINE_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_CELL_LINE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"CellLine": CELL_LINE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_CPI(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_CPI, self).__init__(*args, dataset_name="cpi", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_CPI(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_CPI, self).__init__(*args, dataset_name="cpi", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"compound": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2013_PC(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2013_PC, self).__init__(*args, dataset_name="bionlp_st_2013_pc", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene_or_gene_product": GENE_TAG, "Complex": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_BIONLP_ST_2013_PC(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_BIONLP_ST_2013_PC, self).__init__(*args, dataset_name="bionlp_st_2013_pc", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Simple_chemical": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2013_GE(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2013_GE, self).__init__(*args, dataset_name="bionlp_st_2013_ge", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2011_GE(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2011_GE, self).__init__(*args, dataset_name="bionlp_st_2011_ge", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2011_ID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2011_ID, self).__init__(*args, dataset_name="bionlp_st_2011_id", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_BIONLP_ST_2011_ID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_BIONLP_ST_2011_ID, self).__init__(*args, dataset_name="bionlp_st_2011_id", **kwargs)
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"Chemical": CHEMICAL_TAG}
@@ -5353,14 +5533,134 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
-class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
+class HUNER_SPECIES_BIONLP_ST_2011_ID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_BIONLP_ST_2011_ID, self).__init__(*args, dataset_name="bionlp_st_2011_id", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Organism": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2011_REL(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2011_REL, self).__init__(*args, dataset_name="bionlp_st_2011_rel", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2011_EPI(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2011_EPI, self).__init__(*args, dataset_name="bionlp_st_2011_epi", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_SPECIES_BIONLP_ST_2019_BB(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_BIONLP_ST_2019_BB, self).__init__(*args, dataset_name="bionlp_st_2019_bb", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Microorganism": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
 
+class HUNER_GENE_BIOID(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)
+        super(HUNER_GENE_BIOID, self).__init__(*args, dataset_name="bioid", **kwargs)
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
-        return {"Gene_or_gene_product": GENE_TAG}
+        return {"gene": GENE_TAG, "protein": GENE_TAG}
 
     def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_BIOID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_BIOID, self).__init__(*args, dataset_name="bioid", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"chemical": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_SPECIES_BIOID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_BIOID, self).__init__(*args, dataset_name="bioid", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"species": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CELL_LINE_BIOID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CELL_LINE_BIOID, self).__init__(*args, dataset_name="bioid", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"cell": CELL_LINE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_GNORMPLUS(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_GNORMPLUS, self).__init__(*args, dataset_name="gnormplus", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene": GENE_TAG, "FamilyName": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_PROGENE(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_PROGENE, self).__init__(*args, dataset_name="progene", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"progene_text": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_NLM_CHEM(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_NLM_CHEM, self).__init__(*args, dataset_name="nlmchem", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Chemical": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+# Already implemented earlier
+# class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
+#     def __init__(self, *args, **kwargs):
+#         super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)

#     def get_entity_type_mapping(self) -> Optional[Dict]:
#         return {"Gene_or_gene_product": GENE_TAG}

#     def build_corpus_directory_name(self, dataset_name: str) -> str:
#         return self.__class__.__name__.lower()

From c85760283ddd36f0964707c4eb3d754d87aaa97d Mon Sep 17 00:00:00 2001
From: Xing Wang
Date: Tue, 7 Feb 2023 12:19:36 +0100
Subject: [PATCH 04/12] merged current master (07/02/23) into branch bigbio integration

---
 flair/datasets/biomedical.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 335e9d4e79..1ad2d97bb0 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -1689,12 +1689,7 @@ class HUNER_CHEMICAL_CHEMDNER(HunerDataset):
     HUNER version of the CHEMDNER corpus containing chemical annotations.
     """
 
-<<<<<<< HEAD
-    def __init__(self, *args, download_folder=None, **kwargs):
-        self.download_folder = download_folder
-=======
     def __init__(self, *args, **kwargs):
->>>>>>> master
         super().__init__(*args, **kwargs)
 
     @staticmethod
@@ -1702,20 +1697,11 @@ def split_url() -> str:
         return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chemdner"
 
     def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
-<<<<<<< HEAD
-        self.download_folder = data_dir / "original"
-        os.makedirs(str(self.download_folder), exist_ok=True)
-        CHEMDNER.download_dataset(self.download_folder)
-        train_data = bioc_to_internal(self.download_folder / "chemdner_corpus" / "training.bioc.xml")
-        dev_data = bioc_to_internal(self.download_folder / "chemdner_corpus" / "development.bioc.xml")
-        test_data = bioc_to_internal(self.download_folder / "chemdner_corpus" / "evaluation.bioc.xml")
-=======
         os.makedirs(str(data_dir), exist_ok=True)
         CHEMDNER.download_dataset(data_dir)
         train_data = bioc_to_internal(data_dir / "chemdner_corpus" / "training.bioc.xml")
         dev_data = bioc_to_internal(data_dir / "chemdner_corpus" / "development.bioc.xml")
         test_data = bioc_to_internal(data_dir / "chemdner_corpus" / "evaluation.bioc.xml")
->>>>>>> master
         all_data = merge_datasets([train_data, dev_data, test_data])
         all_data = filter_and_map_entities(
             all_data,
From 804247bbeea977c568cd943f5c4385d36f956091 Mon Sep 17 00:00:00 2001
From: Xing Wang
Date: Mon, 13 Feb 2023 10:09:18 +0100
Subject: [PATCH 05/12] added new BigBio dataset to HunFlair

---
 flair/datasets/biomedical.py | 194 ++++++++++++++++++++++++++++++-----
 1 file changed, 169 insertions(+), 25 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 1ad2d97bb0..af817aa45b 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -5162,6 +5162,9 @@ def __init__(
         base_path: Union[str, Path] = None,
         in_memory: bool = True,
         sentence_splitter: SentenceSplitter = None,
+        train_split_name: Union[str, bool] = None,
+        dev_split_name: Union[str, bool] = None,
+        test_split_name: Union[str, bool] = None,
     ):
         """
         :param dataset_name: Name of the dataset in the huggingface hub (e.g. nlmchem or bigbio/nlmchem)
@@ -5169,6 +5172,9 @@ def __init__(
         :param in_memory: If True, keeps dataset in memory giving speedups in training.
         :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which
             segments the text into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
+        :param train_split_name: Name of the training split in bigbio, usually train (default: None)
+        :param dev_split_name: Name of the development split in bigbio, usually validation (default: None)
+        :param test_split_name: Name of the test split in bigbio, usually test (default: None)
         """
 
         if base_path is None:
@@ -5190,31 +5196,25 @@ def __init__(
         data_folder = base_path / dataset_dir_name
 
         train_file = data_folder / "train.conll"
-        test_file = data_folder / "test.conll"
+        # test_file = data_folder / "test.conll"
 
         # Download data if necessary
-        # Some datasets only have train or test splits, not both
-        if not train_file.exists() and not test_file.exists():
+        # Some datasets in BigBio only have train or test splits, not both
+        # If only a test split exists, assign it to the train split
+        # If only a train split exists, sample the others from it (sample_missing_splits=True)
+        if not train_file.exists():
             from datasets import load_dataset
 
             dataset = load_dataset(full_dataset_name, name=dataset_name + "_bigbio_kb")
 
-            # Special case for ProGene: We need to use the split_0_train and split_0_test splits
-            train_split_name = None
             if "train" in dataset:
                 train_split_name = "train"
-            elif "split_0_train" in dataset:
-                train_split_name = "split_0_train"
-            test_split_name = None
             if "test" in dataset:
                 test_split_name = "test"
-            elif "split_0_test" in dataset:
-                test_split_name = "split_0_test"
-            validation_split_name = None
             if "validation" in dataset:
-                validation_split_name = "validation"
-            elif "split_0_validation" in dataset:
-                validation_split_name = "split_0_validation"
+                dev_split_name = "validation"
+
+            assert not (train_split_name is None and test_split_name is None)
 
             splits = {}
             # Not every dataset has a dev / validation set!
@@ -5222,8 +5222,10 @@ def __init__(
                 splits["train"] = self.to_internal_dataset(dataset, train_split_name)
             if test_split_name is not None:
                 splits["test"] = self.to_internal_dataset(dataset, test_split_name)
-            if validation_split_name is not None:
-                splits["dev"] = self.to_internal_dataset(dataset, validation_split_name)
+            if dev_split_name is not None:
+                splits["dev"] = self.to_internal_dataset(dataset, dev_split_name)
+            if "train" not in splits and "test" in splits:
+                splits["train"] = splits.pop("test")
 
             # Perform type mapping if necessary
             type_mapping = self.get_entity_type_mapping()
@@ -5237,10 +5239,7 @@ def __init__(
             conll_writer.process_dataset(splits, data_folder)
 
         super(BIGBIO_NER_CORPUS, self).__init__(
-            data_folder,
-            columns,
-            in_memory=in_memory,
-            comment_symbol="#",
+            data_folder, columns, in_memory=in_memory, comment_symbol="#", sample_missing_splits=True
         )
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
@@ -5395,9 +5394,9 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
-class HUNER_GENE_SPECIES_BIORED(BIGBIO_NER_CORPUS):
+class HUNER_SPECIES_BIORED(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_GENE_SPECIES_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+        super(HUNER_SPECIES_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"OrganismTaxon": SPECIES_TAG}
@@ -5406,9 +5405,9 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
-class HUNER_GENE_CELL_LINE_BIORED(BIGBIO_NER_CORPUS):
+class HUNER_CELL_LINE_BIORED(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_GENE_CELL_LINE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+        super(HUNER_CELL_LINE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"CellLine": CELL_LINE_TAG}
@@ -5606,7 +5605,19 @@ class HUNER_GENE_PROGENE(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_GENE_PROGENE, self).__init__(*args, dataset_name="progene", **kwargs)
+        # Special case for ProGene: We need to use the split_0_train and split_0_test splits
+        # as they are currently provided in BigBio
+        train_split_name = "split_0_train"
+        dev_split_name = "split_0_validation"
+        test_split_name = "split_0_test"
+        super(HUNER_GENE_PROGENE, self).__init__(
+            *args,
+            dataset_name="progene",
+            **kwargs,
+            train_split_name=train_split_name,
+            dev_split_name=dev_split_name,
+            test_split_name=test_split_name,
+        )
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"progene_text": GENE_TAG}
@@ -5626,6 +5637,139 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
+class HUNER_GENE_SETH_CORPUS(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_SETH_CORPUS, self).__init__(*args, dataset_name="seth_corpus", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+# FIXME: Annotation mismatch from the source PubTator files
+# EXAMPLE: Annotation error (21904390) - Doc: p686k1684gene vs. Mention: DKFZ p686k1684
+def map_fn(example):
+    example["entities"] = [
+        repair_doc_offsets(passages, entities) for passages, entities in zip(example["passages"], example["entities"])
+    ]
+    return example
+
+
+def repair_doc_offsets(passages, entities):
+    """
+    Some offsets are broken in tmvar_v3, we need to fix them. Replace doc in-place.
+    """
+
+    text = " ".join([passage["text"][0] for passage in passages])
+
+    sentences = text.split(". ")
+
+    sentence_indexes = [m.start() + 2 for m in re.finditer(r"\. ", text)]  # because the suffix is ". "
+    sentence_indexes = [0] + sentence_indexes
+
+    doc_entities = entities
+
+    if len(doc_entities) == 0:
+        return
+
+    # doc_entities = dataset[split].filter(lambda x: x["document_id"] == "21904390")[:]["entities"][0]
+
+    # print(sentence_indexes)
+    # print(len(sentences))
+    # print(text)
+    # print(doc_entities)
+
+    sentence_index = 0
+    entity_index = 0
+    current_offset = 0
+    next_sentence_offset = 0
+    next_entity_offset = text[current_offset:].find(doc_entities[entity_index]["text"][0])
+    while True:
+        if sentence_index >= len(sentence_indexes) and entity_index >= len(doc_entities):
+            break
+        if next_sentence_offset <= next_entity_offset:
+            sentence_end = sentence_indexes[sentence_index] + len(sentences[sentence_index]) + 2
+            # print(f"Sentence {sentence_index} @ offsets {sentence_indexes[sentence_index]} to {sentence_end}")
+            # print(sentences[sentence_index] + ". ")
+            sentence_index += 1
+            if sentence_index >= len(sentence_indexes):
+                next_sentence_offset = len(text)
+            else:
+                next_sentence_offset = sentence_indexes[sentence_index]
+            # print(f"DEBUG next_sentence_offset: {next_sentence_offset}")
+        else:  # next_entity_offset < next_sentence_offset
+            entity = doc_entities[entity_index]
+            entity_name = entity["text"][0]
+            given_offset_start = entity["offsets"][0][0]
+            given_offset_end = entity["offsets"][0][1]
+            # print(f"  {entity_name} @ offsets (real) {next_entity_offset} to {next_entity_offset + len(entity_name)}")
+            # print(f"  {text[given_offset_start:given_offset_end]} @ offset (given) {given_offset_start} to {given_offset_end}")
+            if given_offset_start != next_entity_offset:  # Mismatched entities
+                # print(doc_entities)
+                entity["offsets"][0][0] = next_entity_offset
+                entity["offsets"][0][1] = next_entity_offset + len(entity_name)
+                # print(doc_entities)
+            current_offset = next_entity_offset + len(entity_name)
+            entity_index += 1
+            if entity_index >= len(doc_entities):
+                next_entity_offset = len(text)
+            else:
+                next_entity_offset = current_offset + text[current_offset:].find(doc_entities[entity_index]["text"][0])
+
+    return doc_entities
+
+
+class HUNER_GENE_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_TMVAR_V3, self).__init__(*args, dataset_name="tmvar_v3", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"['Gene']": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+    # Some offsets are broken in tmvar_v3, we need to fix them
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        dataset = dataset.map(map_fn, batched=True)
+        return super(HUNER_GENE_TMVAR_V3, self).to_internal_dataset(dataset, split)
+
+
+class HUNER_SPECIES_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_TMVAR_V3, self).__init__(*args, dataset_name="tmvar_v3", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"['Species']": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+    # Some offsets are broken in tmvar_v3, we need to fix them
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        dataset = dataset.map(map_fn, batched=True)
+        return super(HUNER_SPECIES_TMVAR_V3, self).to_internal_dataset(dataset, split)
+
+
+# class HUNER_CELL_LINE_TMVAR_V3(BIGBIO_NER_CORPUS):
+#     def __init__(self, *args, **kwargs):
+#         super(HUNER_CELL_LINE_TMVAR_V3, self).__init__(*args, dataset_name="tmvar_v3", **kwargs)

#     def get_entity_type_mapping(self) -> Optional[Dict]:
#         return {"['CellLine']": CELL_LINE_TAG}

#     def build_corpus_directory_name(self, dataset_name: str) -> str:
#         return self.__class__.__name__.lower()


# Already implemented earlier
# class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
#     def __init__(self, *args, **kwargs):
#         super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)

#     def get_entity_type_mapping(self) -> Optional[Dict]:
#         return {"Gene_or_gene_product": GENE_TAG}

#     def build_corpus_directory_name(self, dataset_name: str) -> str:
#         return self.__class__.__name__.lower()
From 72a7ddf61b1659905f3b616c56a8ddb7cda4d3f1 Mon Sep 17 00:00:00 2001
From: Xing Wang
Date: Tue, 28 Feb 2023 17:23:56 +0100
Subject: [PATCH 06/12] finished BigBio integration for HunFlair v2

---
 flair/datasets/biomedical.py | 130 +++++++++++++++++++++--------------
 1 file changed, 77 insertions(+), 53 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index af817aa45b..21052a6445 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -2770,7 +2770,7 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
                     start, end = int(fields[2]), int(fields[3])
 
                     if start == end:
-                        continue
+                        continue_S800
 
                     entities_per_document[fname].append(Entity((start, end), "Species"))
@@ -5186,7 +5186,11 @@ def __init__(
         columns = {0: "text", 1: "ner"}
 
         # build dataset name and full huggingface reference name
-        if not dataset_name.startswith("bigbio/"):
+        if dataset_name.startswith("/"):  # Absolute path for local BigBio datasets
+            full_dataset_name = dataset_name
+            dataset_name = dataset_name.split("/")[-1]
+            dataset_name = dataset_name.split(".")[0]
+        elif not dataset_name.startswith("bigbio/"):
             full_dataset_name = "bigbio" + "/" + dataset_name
         else:
             full_dataset_name = dataset_name
@@ -5325,7 +5329,7 @@ def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]],
 
         else:
             # This should never happen :-D
-            return None
+            return -1, -1
 
 
 class HUNER_GENE_NLM_GENE(BIGBIO_NER_CORPUS):
@@ -5648,11 +5652,80 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
+class HUNER_GENE_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_TMVAR_V3, self).__init__(
+            *args,
+            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
+            # dataset_name="tmvar_v3",
+            **kwargs,
+        )
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+    # Some offsets are broken in tmvar_v3, we need to fix them
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        # dataset = dataset.map(map_fn, batched=True)
+        return super(HUNER_GENE_TMVAR_V3, self).to_internal_dataset(dataset, split)
+
+
+class HUNER_SPECIES_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_TMVAR_V3, self).__init__(
+            *args,
+            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
+            # dataset_name="tmvar_v3",
+            **kwargs,
+        )
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"['Species']": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+    # Some offsets are broken in tmvar_v3, we need to fix them
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        # dataset = dataset.map(map_fn, batched=True)
+        return super(HUNER_SPECIES_TMVAR_V3, self).to_internal_dataset(dataset, split)
+
+
+class HUNER_CELL_LINE_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CELL_LINE_TMVAR_V3, self).__init__(
+            *args,
+            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
+            # dataset_name="tmvar_v3",
+            **kwargs,
+        )
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"['CellLine']": CELL_LINE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+# Deprecated, is fixed in BigBio but useful code for debugging future issues
 # FIXME: Annotation mismatch from the source PubTator files
 # EXAMPLE: Annotation error (21904390) - Doc: p686k1684gene vs. Mention: DKFZ p686k1684
 def map_fn(example):
     example["entities"] = [
-        repair_doc_offsets(passages, entities) for passages, entities in zip(example["passages"], example["entities"])
+        repair_doc_offsets(passages, entities)
+        if passages[0]
+        == "Two novel mutations of the PAX6 gene causing different phenotype in a cohort of Chinese patients."
+        else entities
+        for passages, entities in zip(example["passages"], example["entities"])
     ]
     return example

From 9ed972615fec2d013adf9fa804f95118b9cc1159 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?=
Date: Wed, 15 Mar 2023 15:37:00 +0100
Subject: [PATCH 07/12] Remove debugging code + fix local data set paths

---
 flair/datasets/biomedical.py | 109 +++--------------------------------
 1 file changed, 8 insertions(+), 101 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 21052a6445..f40e9dcd1d 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -5295,11 +5295,11 @@ def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
             id_to_entities[passage_id].append(Entity(char_span=entity_offset, entity_type=entity["type"]))
 
             # FIXME: This is just for debugging purposes
-            passage_text = id_to_text[passage_id]
-            doc_text = passage_text[entity_offset[0] : entity_offset[1]]
-            mention_text = entity["text"][0]
-            if doc_text != mention_text:
-                print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}")
+            # passage_text = id_to_text[passage_id]
+            # doc_text = passage_text[entity_offset[0] : entity_offset[1]]
+            # mention_text = entity["text"][0]
+            # if doc_text != mention_text:
+            #     print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}")
 
         return InternalBioNerDataset(documents=id_to_text, entities_per_document=id_to_entities)
@@ -5656,8 +5656,7 @@ class HUNER_GENE_TMVAR_V3(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
         super(HUNER_GENE_TMVAR_V3, self).__init__(
             *args,
-            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
-            # dataset_name="tmvar_v3",
+            dataset_name="tmvar_v3",
             **kwargs,
         )
@@ -5680,8 +5679,7 @@ class HUNER_SPECIES_TMVAR_V3(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
         super(HUNER_SPECIES_TMVAR_V3, self).__init__(
             *args,
-            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
-            # dataset_name="tmvar_v3",
+            dataset_name="tmvar_v3",
             **kwargs,
         )
@@ -5704,8 +5702,7 @@ class HUNER_CELL_LINE_TMVAR_V3(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
         super(HUNER_CELL_LINE_TMVAR_V3, self).__init__(
             *args,
-            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
-            # dataset_name="tmvar_v3",
+            dataset_name="tmvar_v3",
             **kwargs,
         )
@@ -5714,93 +5711,3 @@ def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"['CellLine']": CELL_LINE_TAG}
 
     def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
-
-
-# Deprecated, is fixed in BigBio but useful code for debugging future issues
-# FIXME: Annotation mismatch from the source PubTator files
-# EXAMPLE: Annotation error (21904390) - Doc: p686k1684gene vs. Mention: DKFZ p686k1684
-def map_fn(example):
-    example["entities"] = [
-        repair_doc_offsets(passages, entities)
-        if passages[0]
-        == "Two novel mutations of the PAX6 gene causing different phenotype in a cohort of Chinese patients."
-        else entities
-        for passages, entities in zip(example["passages"], example["entities"])
-    ]
-    return example
-
-
-def repair_doc_offsets(passages, entities):
-    """
-    Some offsets are broken in tmvar_v3, we need to fix them. Replace doc in-place.
-    """
-
-    text = " ".join([passage["text"][0] for passage in passages])
-
-    sentences = text.split(". ")
-
-    sentence_indexes = [m.start() + 2 for m in re.finditer(r"\. ", text)]  # because the suffix is ". "
-    sentence_indexes = [0] + sentence_indexes
-
-    doc_entities = entities
-
-    if len(doc_entities) == 0:
-        return
-
-    # doc_entities = dataset[split].filter(lambda x: x["document_id"] == "21904390")[:]["entities"][0]
-
-    # print(sentence_indexes)
-    # print(len(sentences))
-    # print(text)
-    # print(doc_entities)
-
-    sentence_index = 0
-    entity_index = 0
-    current_offset = 0
-    next_sentence_offset = 0
-    next_entity_offset = text[current_offset:].find(doc_entities[entity_index]["text"][0])
-    while True:
-        if sentence_index >= len(sentence_indexes) and entity_index >= len(doc_entities):
-            break
-        if next_sentence_offset <= next_entity_offset:
-            sentence_end = sentence_indexes[sentence_index] + len(sentences[sentence_index]) + 2
-            # print(f"Sentence {sentence_index} @ offsets {sentence_indexes[sentence_index]} to {sentence_end}")
-            # print(sentences[sentence_index] + ". ")
-            sentence_index += 1
-            if sentence_index >= len(sentence_indexes):
-                next_sentence_offset = len(text)
-            else:
-                next_sentence_offset = sentence_indexes[sentence_index]
-            # print(f"DEBUG next_sentence_offset: {next_sentence_offset}")
-        else:  # next_entity_offset < next_sentence_offset
-            entity = doc_entities[entity_index]
-            entity_name = entity["text"][0]
-            given_offset_start = entity["offsets"][0][0]
-            given_offset_end = entity["offsets"][0][1]
-            # print(f"  {entity_name} @ offsets (real) {next_entity_offset} to {next_entity_offset + len(entity_name)}")
-            # print(f"  {text[given_offset_start:given_offset_end]} @ offset (given) {given_offset_start} to {given_offset_end}")
-            if given_offset_start != next_entity_offset:  # Mismatched entities
-                # print(doc_entities)
-                entity["offsets"][0][0] = next_entity_offset
-                entity["offsets"][0][1] = next_entity_offset + len(entity_name)
-                # print(doc_entities)
-            current_offset = next_entity_offset + len(entity_name)
-            entity_index += 1
-            if entity_index >= len(doc_entities):
-                next_entity_offset = len(text)
-            else:
-                next_entity_offset = current_offset + text[current_offset:].find(doc_entities[entity_index]["text"][0])
-
-    return doc_entities
-
-
-# Already implemented earlier
-# class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
-#     def __init__(self, *args, **kwargs):
-#         super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)

-#     def get_entity_type_mapping(self) -> Optional[Dict]:
-#         return {"Gene_or_gene_product": GENE_TAG}

-#     def build_corpus_directory_name(self, dataset_name: str) -> str:
-#         return self.__class__.__name__.lower()
(see BIGBIO_NER_CORPUS)") class JNLPBA(ColumnCorpus): """ Original corpus of the JNLPBA shared task. @@ -990,6 +991,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return data +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class MIRNA(ColumnCorpus): """ Original miRNA corpus. @@ -1617,6 +1619,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return filter_and_map_entities(dataset, {"protein": GENE_TAG}) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CHEMDNER(ColumnCorpus): """ Original corpus of the CHEMDNER shared task. @@ -1720,6 +1723,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return all_data +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class IEPA(ColumnCorpus): """ IEPA corpus as provided by http://corpora.informatik.hu-berlin.de/ @@ -1835,6 +1839,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return merge_datasets([train_data, test_data]) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class LINNEAUS(ColumnCorpus): """ Original LINNEAUS corpus containing species annotations. @@ -1939,6 +1944,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return LINNEAUS.download_and_parse_dataset(data_dir) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CDR(ColumnCorpus): """ CDR corpus as provided by https://github.com/JHnlp/BioCreative-V-CDR-Corpus @@ -2054,6 +2060,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return all_data +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class VARIOME(ColumnCorpus): """ Variome corpus as provided by http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip @@ -2213,6 +2220,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return all_data +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class NCBI_DISEASE(ColumnCorpus): """ Original NCBI disease corpus containing disease annotations. @@ -2468,6 +2476,7 @@ def parse_input_file(input_file: Path): return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class SCAI_CHEMICALS(ScaiCorpus): """ Original SCAI chemicals corpus containing chemical annotations. @@ -2496,6 +2505,7 @@ def perform_corpus_download(data_dir: Path) -> Path: return corpus_file +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class SCAI_DISEASE(ScaiCorpus): """ Original SCAI disease corpus containing disease annotations. @@ -2575,6 +2585,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return filter_and_map_entities(corpus, entity_mapping) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class OSIRIS(ColumnCorpus): """ Original OSIRIS corpus containing variation and gene annotations. 
@@ -2645,7 +2656,6 @@ def parse_dataset(cls, corpus_folder: Path, fix_annotation=True):
             file for file in os.listdir(str(corpus_folder)) if file.endswith(".txt") and not file.startswith("README")
         ]
         for text_file in input_files:
-
             with open(os.path.join(str(corpus_folder), text_file), encoding="utf8") as text_reader:
                 document_text = text_reader.read()
                 if not document_text:
@@ -2770,7 +2780,7 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
                 start, end = int(fields[2]), int(fields[3])
 
                 if start == end:
-                    continue # Illegal annotation
+                    continue  # Illegal annotation
 
                 entities_per_document[fname].append(Entity((start, end), "Species"))
 
@@ -3449,6 +3459,7 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
         return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document)
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class BC2GM(ColumnCorpus):
     """
     Original BioCreative-II-GM corpus containing gene annotations.
@@ -3754,6 +3765,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
         return filter_and_map_entities(dataset, entity_type_mapping)
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class CHEBI(ColumnCorpus):
     """
     Original CHEBI corpus containing all annotations.
@@ -4018,6 +4030,7 @@ def parse_input_files(input_folder: Path) -> InternalBioNerDataset:
         return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document)
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class BIONLP2013_PC(BioNLPCorpus):
     """
     Corpus of the BioNLP'2013 Pathway Curation shared task
@@ -4060,6 +4073,7 @@ def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]:
         return train_folder, dev_folder, test_folder
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class BIONLP2013_CG(BioNLPCorpus):
     """
     Corpus of the BioNLP'2013 Cancer Genetics shared task
@@ -4088,6 +4102,7 @@ def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]:
         return train_folder, dev_folder, test_folder
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class ANAT_EM(ColumnCorpus):
     """
     Corpus for anatomical named entity mention recognition.
@@ -4967,6 +4982,7 @@ def parse_corpus(input_file: Path) -> InternalBioNerDataset:
         return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document)
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class PDR(ColumnCorpus):
     """
     Corpus of plant-disease relations from Kim et al., consisting of named entity annotations
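All tags added in the commit above use the third-party `deprecated` package, whose import is
added at the top of the diff and re-sorted in the next commit. A minimal, self-contained
sketch of what such a tag does at runtime (the class below is a hypothetical stand-in, not
taken from the patch):

    import warnings

    from deprecated import deprecated

    @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
    class LegacyCorpus:  # hypothetical stand-in for the tagged flair corpora
        pass

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        LegacyCorpus()  # instantiating a tagged class emits a DeprecationWarning
        print(caught[0].message)  # message carries the version and reason above

Instantiation still works as before; users merely receive a warning pointing them to
BIGBIO_NER_CORPUS.
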
""" id_to_text = {} - id_to_entities = {} + id_to_entities: Dict[str, List] = {} for document in dataset[split]: document_id = document["document_id"] passage_offsets = [] From a503c5b0a7bc977374e6a9bfb72a327870e5feab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Wed, 12 Apr 2023 16:30:51 +0200 Subject: [PATCH 12/12] Revise BIGBIO_NER_CORPUS initialization: store conll files in separate directories per sentence splitter (configuration) --- flair/datasets/biomedical.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index efd1608c51..aa0f0c9335 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -5177,7 +5177,7 @@ def __init__( dataset_name: str, base_path: Union[str, Path] = None, in_memory: bool = True, - sentence_splitter: SentenceSplitter = None, + sentence_splitter: Optional[SentenceSplitter] = None, train_split_name: Optional[str] = None, dev_split_name: Optional[str] = None, test_split_name: Optional[str] = None, @@ -5212,11 +5212,12 @@ def __init__( full_dataset_name = dataset_name dataset_name = dataset_name.replace("bigbio/", "") + self.sentence_splitter = sentence_splitter if sentence_splitter else SciSpacySentenceSplitter() + dataset_dir_name = self.build_corpus_directory_name(dataset_name) - data_folder = base_path / dataset_dir_name + data_folder = base_path / dataset_dir_name / self.sentence_splitter.name train_file = data_folder / "train.conll" - # test_file = data_folder / "test.conll" # Download data if necessary # Some datasets in BigBio only have train or test splits, not both @@ -5252,10 +5253,7 @@ def __init__( if type_mapping: splits = {split: filter_and_map_entities(dataset, type_mapping) for split, dataset in splits.items()} - if sentence_splitter is None: - sentence_splitter = SciSpacySentenceSplitter() - - conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) + conll_writer = CoNLLWriter(sentence_splitter=self.sentence_splitter) conll_writer.process_dataset(splits, data_folder) super(BIGBIO_NER_CORPUS, self).__init__(